Setting R code chunk options
Loading packages and initializing
# Load required packages
library(readxl)
library(openxlsx)
library(writexl)
library(dplyr)
library(lubridate)
library(ggplot2)
library(cowplot)
library(forecast)
library(Kendall)
library(tseries)
library(outliers)
library(tidyverse)
library(smooth)
library(trend)
library(kableExtra)
library(tidyr)
library(gt) #install.packages("gt")
library(gridExtra) #install.packages("gridExtra")
library(zoo)
library(imputeTS)
# Check working directory
getwd()
## [1] "C:/Users/thepa/Documents/TSA_FinalProject/Code"
Importing and Wrangling Data
## Raw Data Set: Unemployment Rate by Age (Thousands)
# Import data set: unemployment counts by age group (thousands),
# one row per country-month, from the raw Excel workbook.
UEAge.Thou <- read_excel(
path="../Data/Raw/UE_Age(Thousands).xlsx", sheet = "Sheet1", col_names = TRUE)
# Format data set:
# - Month arrives as e.g. "2001M04"; replacing the first "M" with "-"
#   yields "2001-04", which lubridate::ym() parses to a Date.
# - Age-band columns (`15-24`, `25+`, `15+`) are coerced to numeric;
#   any non-numeric cells become NA with a coercion warning.
UEAge.Thou_Processed <- UEAge.Thou %>%
mutate(
Month = ym(sub("M", "-", Month)),
Age15to24.Thou = as.numeric(`15-24`),
Age25above.Thou = as.numeric(`25+`),
AgeTotal.Thou = as.numeric(`15+`)) %>%
rename(Country="Reference area") %>%
select(Country,Month,Age15to24.Thou, Age25above.Thou, AgeTotal.Thou) %>%
arrange(Country, Month)
## Raw Data Set: Unemployment Rate by Age (%)
# Import data set: unemployment rates by age group (%), same layout
# as the thousands workbook (one row per country-month).
UEAge.Per <- read_excel(
path="../Data/Raw/UE_Age(%).xlsx", sheet = "Sheet1", col_names = TRUE)
# Format data set: parse "YYYYMmm" months to Dates and coerce the
# age-band percentage columns to numeric (mirrors the .Thou chunk).
UEAge.Per_Processed <- UEAge.Per %>%
mutate(
Month = ym(sub("M", "-", Month)),
Age15to24.Per = as.numeric(`15-24`),
Age25above.Per = as.numeric(`25+`),
AgeTotal.Per = as.numeric(`15+`)) %>%
rename(Country="Reference area") %>%
select(Country,Month,Age15to24.Per, Age25above.Per, AgeTotal.Per) %>%
arrange(Country, Month)
## Raw Data Set: Unemployment Rate by Gender (Thousands)
# Import data set: unemployment counts by gender (thousands),
# one row per country-month.
UEGender.Thou <- read_excel(
path="../Data/Raw/UE_Gender(Thousands).xlsx", sheet = "Sheet1", col_names = TRUE)
# Format data set: parse the "YYYYMmm" month string to a Date and
# coerce the Female/Male/Total columns to numeric.
UEGender.Thou_Processed <- UEGender.Thou %>%
mutate(
Month = ym(sub("M", "-", Month)),
Female.Thou = as.numeric(Female),
Male.Thou = as.numeric(Male),
Total.Thou = as.numeric(Total)) %>%
rename(Country="Reference area") %>%
select(Country,Month,Female.Thou, Male.Thou, Total.Thou) %>%
arrange(Country, Month)
## Raw Data Set: Unemployment Rate by Gender (%)
# Import data set: unemployment rates by gender (%), one row per
# country-month.
UEGender.Per <- read_excel(
  path = "../Data/Raw/UE_Gender(%).xlsx", sheet = "Sheet1", col_names = TRUE
)
# Format data set: standardize the country column name, parse the
# "YYYYMmm" month string into a Date, coerce the gender rate columns
# to numeric, and keep a tidy country/month panel sorted for joining.
UEGender.Per_Processed <- UEGender.Per %>%
  rename(Country = "Reference area") %>%
  mutate(
    Month = ym(sub("M", "-", Month)),
    Female.Per = as.numeric(Female),
    Male.Per = as.numeric(Male),
    Total.Per = as.numeric(Total)
  ) %>%
  select(Country, Month, Female.Per, Male.Per, Total.Per) %>%
  arrange(Country, Month)
# Combine all processed data sets into one country-month panel.
# The age-counts table is the base; the other three are left-joined on
# (Country, Month), so rows present only in the other tables are dropped.
UE_Countries <- UEAge.Thou_Processed %>%
left_join(UEAge.Per_Processed, by=c("Country", "Month")) %>%
left_join(UEGender.Thou_Processed, by=c("Country", "Month")) %>%
left_join(UEGender.Per_Processed, by=c("Country", "Month"))
# Extract Peru data and order the indicator columns (rates first,
# then counts).
# NOTE: the original piped select(-Country, AgeTotal.Per, AgeTotal.Thou)
# before the explicit column list — mixing negative and positive
# selection there was almost certainly meant to be -AgeTotal.Per,
# -AgeTotal.Thou. Since the explicit select below fully determines the
# result, the redundant (and misleading) first select is dropped;
# output is unchanged.
Peru <- UE_Countries %>%
  filter(Country == "Peru") %>%
  select(Month, Age15to24.Per, Age25above.Per, Female.Per, Male.Per,
         Total.Per, Age15to24.Thou, Age25above.Thou, Female.Thou, Male.Thou,
         Total.Thou)
# Check Missing Value (expect 0)
sum(is.na(Peru))
## [1] 0
# Extract US data restricted to Jan 2001 - Dec 2024, with the same
# column order as the Peru extract.
# NOTE: the original piped select(-Country, AgeTotal.Per, AgeTotal.Thou)
# (mixed negative/positive selection, likely meant -AgeTotal.*) before
# the explicit column list; the explicit select fully determines the
# result, so the redundant first select is dropped — output unchanged.
US <- UE_Countries %>%
  filter(Country == "United States of America",
         Month >= as.Date("2001-01-01") & Month <= as.Date("2024-12-01")) %>%
  select(Month, Age15to24.Per, Age25above.Per, Female.Per, Male.Per,
         Total.Per, Age15to24.Thou, Age25above.Thou, Female.Thou, Male.Thou,
         Total.Thou)
# Check Missing Value (expect 0)
sum(is.na(US))
## [1] 0
# Generate global unemployment data as a simple (unweighted) average
# across countries for each month; NAs are dropped per column.
# BUG FIX: Age15to24.Per originally averaged the *count* column
# (`Age15to24.Thou`) instead of the percentage column, so the youth
# unemployment rate series was on a completely wrong scale.
UE_Global <- UE_Countries %>%
  group_by(Month) %>%
  summarise(
    Age15to24.Thou = mean(Age15to24.Thou, na.rm = TRUE),
    Age25above.Thou = mean(Age25above.Thou, na.rm = TRUE),
    AgeTotal.Thou = mean(AgeTotal.Thou, na.rm = TRUE),
    Age15to24.Per = mean(Age15to24.Per, na.rm = TRUE),
    Age25above.Per = mean(Age25above.Per, na.rm = TRUE),
    AgeTotal.Per = mean(AgeTotal.Per, na.rm = TRUE),
    Female.Thou = mean(Female.Thou, na.rm = TRUE),
    Male.Thou = mean(Male.Thou, na.rm = TRUE),
    Total.Thou = mean(Total.Thou, na.rm = TRUE),
    Female.Per = mean(Female.Per, na.rm = TRUE),
    Male.Per = mean(Male.Per, na.rm = TRUE),
    Total.Per = mean(Total.Per, na.rm = TRUE))
# Generate global unemployment data using a weighted average.
# Rows with any NA among the 12 indicator columns (cols 3:14) are
# dropped first so every country contributes to every component.
# (complete.cases(UE_Countries[, 3:14]) would be the idiomatic filter.)
# Weighted rate = sum(unemployed) / sum(implied labor force), where the
# implied labor force per country is count / rate (Thou / Per).
# NOTE(review): the .Thou components are aggregated with mean(), not
# sum() — i.e. they are per-country averages, not global totals.
# Confirm that is the intended definition.
UE_Global_Weighted <- UE_Countries %>%
filter(apply(UE_Countries[, 3:14], 1, function(x) all(!is.na(x)))) %>%
group_by(Month) %>%
summarise(
Age15to24.Per = sum(Age15to24.Thou) / sum(Age15to24.Thou / Age15to24.Per),
Age25above.Per = sum(Age25above.Thou) / sum(Age25above.Thou / Age25above.Per),
Female.Per = sum(Female.Thou) / sum(Female.Thou / Female.Per),
Male.Per = sum(Male.Thou) / sum(Male.Thou / Male.Per),
Total.Per = sum(Total.Thou) / sum(Total.Thou / Total.Per),
Age15to24.Thou = mean(Age15to24.Thou, na.rm = TRUE),
Age25above.Thou = mean(Age25above.Thou, na.rm = TRUE),
Female.Thou = mean(Female.Thou, na.rm = TRUE),
Male.Thou = mean(Male.Thou, na.rm = TRUE),
Total.Thou = mean(Total.Thou, na.rm = TRUE)
)
# Generate weighted global unemployment yearly: average the monthly
# weighted series within each calendar year. across(where(is.numeric))
# covers all indicator columns; the grouping column Year is excluded
# from across(), and Month (a Date) is not numeric so it drops out.
UE_Global_Yearly <- UE_Global_Weighted %>%
mutate(Year = year(Month)) %>%
group_by(Year) %>%
summarize(across(where(is.numeric), mean)) #%>%
#filter(Year >= 2000 & Year <= 2024)
Descriptive Statistics
Global Unemployment Status
# Check total no. of countries
print(length(unique(UE_Countries$Country)))
## [1] 91
# Plot global unemployment rate for simple average vs. weighted average
UE_Global_Combined <- bind_rows(
UE_Global %>% mutate(Source = "Simple Average"),
UE_Global_Weighted %>% mutate(Source = "Weighted Average")
)
ggplot(UE_Global_Combined, aes(x = Month, y = Total.Per, color = Source)) +
geom_line() +
labs(title = "Global Unemployment Rate: Simple Vs. Weighted Average",
x = "Month", y = "Global Unemployment Rate (%)", color = "Source")

# Plots
# Global Unemployment Rate
ggplot(UE_Global_Yearly, aes(x = Year, y = Total.Per)) +
geom_line() +
labs(title = "Global Unemployment Rate",
x = "Year", y = "Global Unemployment Rate (%)")

# Global Unemployment Rate by Age
ggplot(UE_Global_Yearly, aes(x = Year)) +
geom_line(aes(y = Age15to24.Per, color = "Age 15-24")) +
geom_line(aes(y = Age25above.Per, color = "Age 25+")) +
labs(title = "Global Unemployment Rate by Age",
x = "Year", y = "Global Unemployment Rate (%)", color = "Age Group") +
scale_color_manual(values = c("Age 15-24" = "blue", "Age 25+" = "red"))

# Global Unemployment Rate by Sex
ggplot(UE_Global_Yearly, aes(x = Year)) +
geom_line(aes(y = Female.Per, color = "Female")) +
geom_line(aes(y = Male.Per, color = "Male")) +
labs(title = "Global Unemployment Rate by Sex",
x = "Year", y = "Global Unemployment Rate (%)", color = "Sex Group") +
scale_color_manual(values = c("Female" = "blue", "Male" = "red"))

Unemployment Status in Peru and United States
Starting in January 2022, Peru revised its methodology for compiling
labor statistics. As a result, caution is needed when analyzing
employment data across this period.
## Unemployment Status in Peru
p1_PE <- ggplot(Peru, aes(x = Month, y = Total.Thou)) +
geom_line(color = "blue", alpha = 1) +
labs(x = "", y = "Unemployment (Thousands)")
p2_PE <- ggplot(Peru, aes(x = Month, y = Total.Per)) +
geom_line(color = "red", alpha = 1) +
labs(x = "Year", y = "Unemployment Rate (%)")
Peru_LF <- Peru %>%
mutate(LF.Part = (Total.Thou / Total.Per) * 100)
p3_PE <- ggplot(Peru_LF, aes(x = Month, y = LF.Part)) +
geom_line(color = "orange", alpha = 1) +
labs(x = "", y = "Laborforce Participation (Thousands)")
grid.arrange(p1_PE, p2_PE, p3_PE, ncol = 3,
top = "Unemployment Status in Peru")

## Unemployment Status in the US
# Unemployment level (thousands)
p1_US <- ggplot(US, aes(x = Month, y = Total.Thou)) +
  geom_line(color = "blue", alpha = 1) +
  labs(x = "", y = "Unemployment (Thousands)")
# Unemployment rate (%)
p2_US <- ggplot(US, aes(x = Month, y = Total.Per)) +
  geom_line(color = "red", alpha = 1) +
  labs(x = "Year", y = "Unemployment Rate (%)")
# Labor force implied by level / rate (count / (rate/100)), in thousands
US_LF <- US %>%
  mutate(LF.Part = (Total.Thou / Total.Per) * 100)
p3_US <- ggplot(US_LF, aes(x = Month, y = LF.Part)) +
  geom_line(color = "orange", alpha = 1) +
  labs(x = "", y = "Laborforce Participation (Thousands)")
# Arrange plots side by side.
# BUG FIX: the original passed the Peru plots (p1_PE, p2_PE, p3_PE)
# under the US title, so the US figure showed Peru data.
grid.arrange(p1_US, p2_US, p3_US, ncol = 3,
             top = "Unemployment Status in the US")

# Comparison of Unemployment Rate between Peru and US
UE_Comparison <- UE_Countries %>%
filter(Country %in% c("Peru", "United States of America")) %>%
mutate(Month = as.Date(Month)) %>%
filter(Month >= as.Date("2001-01-01") & Month <= as.Date("2024-12-01"))
ggplot(UE_Comparison, aes(x = Month, y = Total.Per, color = Country)) +
geom_line(size = 0.8) +
geom_point(size = 0.8, alpha = 0.7) +
theme_gray(base_size = 14) +
labs(title = "Unemployment Rate by Countries (2001-2024)",
x = "Year", y = "Unemployment Rate (%)", color = "Country") +
theme(plot.title = element_text(face = "bold", size = 16),
axis.title.y = element_text(size = 14), axis.text = element_text(size = 12),
legend.position = "right")

# Comparison of Unemployment Rate by Age Group
PE1 <- ggplot(Peru, aes(x = Month)) +
geom_line(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8) +
geom_line(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8) +
geom_point(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8, alpha = 0.5) +
geom_point(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8, alpha = 0.5) +
labs(title = "Unemployment Rate by Age Group",
x = "Year", y = "Peru Unemployment Rate (%)", color = "Age Group") +
scale_color_manual(values = c("Age 15-24" = "blue", "Age 25+" = "red")) +
theme_gray(base_size = 12) +
theme(plot.title = element_text(size = 14),
axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
axis.text = element_text(size = 12), legend.position = "bottom")
US1 <- ggplot(US, aes(x = Month)) +
geom_line(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8) +
geom_line(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8) +
geom_point(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8, alpha = 0.5) +
geom_point(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8, alpha = 0.5) +
labs(title = "",
x = "Year", y = "US Unemployment Rate (%)", color = "Age Group") +
scale_color_manual(values = c("Age 15-24" = "blue", "Age 25+" = "red")) +
theme_gray(base_size = 12) +
theme(plot.title = element_text(size = 14),
axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
axis.text = element_text(size = 12), legend.position = "bottom")
grid.arrange(PE1, US1, ncol = 2)

# Comparison of Unemployment Rate by Sex Group
sPE1 <- ggplot(Peru, aes(x = Month)) +
geom_line(aes(y = Female.Per, color = "Female"), size = 0.8) +
geom_line(aes(y = Male.Per, color = "Male"), size = 0.8) +
geom_point(aes(y = Female.Per, color = "Female"), size = 0.8, alpha = 0.5) +
geom_point(aes(y = Male.Per, color = "Male"), size = 0.8, alpha = 0.5) +
labs(title = "Unemployment Rate by Sex Group",
x = "Year", y = "Peru Unemployment Rate (%)", color = "Sex Group") +
scale_color_manual(values = c("Female" = "blue", "Male" = "red")) +
theme_gray(base_size = 12) +
theme(plot.title = element_text(size = 14),
axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
axis.text = element_text(size = 12), legend.position = "bottom")
sUS1 <- ggplot(US, aes(x = Month)) +
geom_line(aes(y = Female.Per, color = "Female"), size = 0.8) +
geom_line(aes(y = Male.Per, color = "Male"), size = 0.8) +
geom_point(aes(y = Female.Per, color = "Female"), size = 0.8, alpha = 0.5) +
geom_point(aes(y = Male.Per, color = "Male"), size = 0.8, alpha = 0.5) +
labs(title = "",
x = "Year", y = "US Unemployment Rate (%)", color = "Sex Group") +
scale_color_manual(values = c("Female" = "blue", "Male" = "red")) +
theme_gray(base_size = 12) +
theme(plot.title = element_text(size = 14),
axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
axis.text = element_text(size = 12), legend.position = "bottom")
grid.arrange(sPE1, sUS1, ncol = 2)

Additional graphs(Jisup)
Initial plot of Peru from the perspective of the level of
unemployment
peru_unemployment <- read_excel(
path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "Peru_unemployment", col_names = TRUE)
# Check for missing values
summary(peru_unemployment)
## Country Month Age15to24.Thou
## Length:281 Min. :2001-04-01 00:00:00.00 Min. : 39.3
## Class :character 1st Qu.:2007-02-01 00:00:00.00 1st Qu.:136.8
## Mode :character Median :2013-01-01 00:00:00.00 Median :154.0
## Mean :2012-12-16 01:42:29.47 Mean :161.9
## 3rd Qu.:2018-11-01 00:00:00.00 3rd Qu.:177.1
## Max. :2024-09-01 00:00:00.00 Max. :379.0
## Age25above.Thou AgeTotal.Thou Age15to24.Per Age25above.Per
## Min. :119.7 Min. : 191.0 Min. : 4.90 Min. :2.600
## 1st Qu.:169.4 1st Qu.: 315.5 1st Qu.:12.10 1st Qu.:4.000
## Median :194.9 Median : 343.2 Median :13.50 Median :5.000
## Mean :238.9 Mean : 400.8 Mean :13.16 Mean :5.244
## 3rd Qu.:219.0 3rd Qu.: 380.3 3rd Qu.:15.00 3rd Qu.:6.300
## Max. :880.9 Max. :1229.5 Max. :18.80 Max. :8.300
## AgeTotal.Per Female.Thou Male.Thou Total.Thou
## Min. : 3.400 Min. : 65.9 Min. : 89.9 Min. : 191.0
## 1st Qu.: 6.000 1st Qu.:166.0 1st Qu.:142.5 1st Qu.: 315.5
## Median : 7.000 Median :184.8 Median :162.0 Median : 343.2
## Mean : 7.043 Mean :214.6 Mean :186.2 Mean : 400.8
## 3rd Qu.: 8.300 3rd Qu.:208.2 3rd Qu.:182.2 3rd Qu.: 380.3
## Max. :10.800 Max. :665.5 Max. :586.0 Max. :1229.5
## Female.Per Male.Per Total.Per ...15
## Min. : 4.000 Min. : 2.800 Min. : 3.400 Mode:logical
## 1st Qu.: 6.800 1st Qu.: 4.900 1st Qu.: 6.000 NA's:281
## Median : 8.500 Median : 5.900 Median : 7.000
## Mean : 8.272 Mean : 6.025 Mean : 7.043
## 3rd Qu.: 9.900 3rd Qu.: 7.100 3rd Qu.: 8.300
## Max. :12.800 Max. :10.000 Max. :10.800
## Employment_level ...17 unemployment rate ...19
## Min. : 1944 Mode:logical Min. : 3.965 Min. :-2.9389
## 1st Qu.: 3060 NA's:281 1st Qu.: 7.249 1st Qu.:-2.2372
## Median : 3506 Median : 8.654 Median :-1.7115
## Mean : 4722 Mean : 8.779 Mean :-1.7362
## 3rd Qu.: 3894 3rd Qu.:10.594 3rd Qu.:-1.3016
## Max. :15557 Max. :13.739 Max. :-0.5654
sum(is.na(peru_unemployment))
## [1] 562
# Check regularity of time index: tabulate day-gaps between
# consecutive months. A 61-day gap means one month is skipped.
diff_date <- diff(peru_unemployment$Month)
table(diff_date) # There's one missing month.
## diff_date
## 28 29 30 31 61
## 17 6 93 163 1
# Drop the time-of-day component so Month joins cleanly on Date
peru_unemployment$Month <- as.Date(peru_unemployment$Month)
# Create full monthly sequence spanning the observed range
full_month_seq <- data.frame(Month = seq.Date(from = min(peru_unemployment$Month),to = max(peru_unemployment$Month),
by = "month"))
# Join with original to find missing months (rows in the full
# sequence with no match in the data)
missing_months <- full_month_seq %>%
anti_join(peru_unemployment, by = "Month")
print(missing_months) # Missing month is 2012-09-01
## Month
## 1 2012-09-01
# Fill in missing months with NA (left join against the full
# sequence inserts an all-NA row for 2012-09-01)
peru_unemployment <- full_month_seq %>%
left_join(peru_unemployment, by = "Month")
# Check for outliers visually
boxplot(peru_unemployment$AgeTotal.Thou,
main = "Boxplot: Peru AgeTotal.Thou",
horizontal = TRUE,
col = "lightblue")

# BUG FIX: both titles below were copy-pasted as "AgeTotal.Thou";
# they now name the variable actually plotted.
boxplot(peru_unemployment$AgeTotal.Per,
        main = "Boxplot: Peru AgeTotal.Per",
        horizontal = TRUE,
        col = "lightblue")

boxplot(peru_unemployment$Employment_level,
        main = "Boxplot: Peru Employment_level",
        horizontal = TRUE,
        col = "lightblue")

# Convert to monthly time series. start = c(2001, 4) matches the
# data's first observation (summary above: min Month 2001-04-01).
ts_peru_total_thou <- ts(peru_unemployment$AgeTotal.Thou,
start = c(2001, 4),
frequency = 12)
ts_peru_total_per <- ts(peru_unemployment$AgeTotal.Per,
start = c(2001, 4),
frequency = 12)
ts_peru_employment_thou <- ts(peru_unemployment$Employment_level,
start = c(2001, 4),
frequency = 12)
# Linear interpolation of NAs — fills the all-NA 2012-09 row that
# was inserted when the missing month was added above.
ts_peru_total_thou <- na_interpolation(ts_peru_total_thou, option = "linear")
ts_peru_total_per <- na_interpolation(ts_peru_total_per, option = "linear")
ts_peru_employment_thou <- na_interpolation(ts_peru_employment_thou, option = "linear")
sum(is.na(ts_peru_total_thou))
## [1] 0
sum(is.na(ts_peru_total_per))
## [1] 0
sum(is.na(ts_peru_employment_thou))
## [1] 0
# plotting the graph
peru_plot_data <- data.frame(
Month = peru_unemployment$Month,
Unemployment_Level = as.numeric(ts_peru_total_thou),
Employment_Level = as.numeric(ts_peru_employment_thou),
Unemployment_Rate_Percent = as.numeric(ts_peru_total_per)
)
p <- ggplot(peru_plot_data, aes(x = Month)) +
geom_line(aes(y = Unemployment_Level, color = "Unemployment Level"), size = 1) +
geom_line(aes(y = Employment_Level, color = "Employment Level"), size = 1) +
geom_line(aes(y = Unemployment_Rate_Percent * 1000, color = "Unemployment Rate [%]"), size = 1) +
scale_y_continuous(
name = "Unemployment [Thousands]",
limits = c(0, 20000),
sec.axis = sec_axis(~./1000, name = "Unemployment Rate [%]", breaks = seq(0, 20, 5))
) +
scale_color_manual(
name = "Legend",
values = c(
"Unemployment Level" = "darkblue",
"Employment Level" = "darkorange",
"Unemployment Rate [%]" = "gray"
)
) +
labs(
title = "Peru: Employment, Unemployment Count and Rate Over Time",
x = "Month"
) +
theme_minimal()
print(p)

# Decompose the time series
decomp_ts_peru_total_thou <- decompose(ts_peru_total_thou)
plot(decomp_ts_peru_total_thou)

decomp_ts_peru_total_per <- decompose(ts_peru_total_per)
plot(decomp_ts_peru_total_per)

decomp_ts_peru_employment_thou <- decompose(ts_peru_employment_thou)
plot(decomp_ts_peru_employment_thou)

# ACF and PACF plots for the three Peru series.
# BUG FIX: the original reused the title "Peru AgeTotal.Thou" for all
# three pairs; the second and third now name the series plotted.
par(mfrow = c(1, 2))
Acf(ts_peru_total_thou, lag.max = 40, main = "ACF: Peru AgeTotal.Thou")
Pacf(ts_peru_total_thou, lag.max = 40, main = "PACF: Peru AgeTotal.Thou")

par(mfrow = c(1, 2))
Acf(ts_peru_total_per, lag.max = 40, main = "ACF: Peru AgeTotal.Per")
Pacf(ts_peru_total_per, lag.max = 40, main = "PACF: Peru AgeTotal.Per")

par(mfrow = c(1, 2))
Acf(ts_peru_employment_thou, lag.max = 40, main = "ACF: Peru Employment_level")
Pacf(ts_peru_employment_thou, lag.max = 40, main = "PACF: Peru Employment_level")

# Grubbs test for outliers
grubbs.test(ts_peru_total_thou)
##
## Grubbs test for one outlier
##
## data: ts_peru_total_thou
## G = 4.25704, U = 0.93528, p-value = 0.002155
## alternative hypothesis: highest value 1229.5 is an outlier
grubbs.test(ts_peru_total_per)
##
## Grubbs test for one outlier
##
## data: ts_peru_total_per
## G = 2.40929, U = 0.97927, p-value = 1
## alternative hypothesis: highest value 10.8 is an outlier
grubbs.test(ts_peru_employment_thou)
##
## Grubbs test for one outlier
##
## data: ts_peru_employment_thou
## G = 2.91618, U = 0.96963, p-value = 0.4687
## alternative hypothesis: highest value 15556.571 is an outlier
Additional graphs(Jisup)
Initial plot of the US from the perspective of the level of
unemployment
us_unemployment <- read_excel(
path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "US_unemployment", col_names = TRUE)
# Check for missing values
summary(us_unemployment)
## Country Month Age15to24.Thou
## Length:925 Min. :1948-01-01 00:00:00.000 Min. : 434
## Class :character 1st Qu.:1967-04-01 00:00:00.000 1st Qu.:1497
## Mode :character Median :1986-07-01 00:00:00.000 Median :2321
## Mean :1986-07-01 19:21:20.431 Mean :2298
## 3rd Qu.:2005-10-01 00:00:00.000 3rd Qu.:2970
## Max. :2025-01-01 00:00:00.000 Max. :5005
## Age25above.Thou AgeTotal.Thou Age15to24.Per Age25above.Per
## Min. : 960 Min. : 1480 Min. : 4.50 Min. : 1.800
## 1st Qu.: 2443 1st Qu.: 4127 1st Qu.: 9.60 1st Qu.: 3.300
## Median : 4207 Median : 6579 Median :11.30 Median : 4.100
## Mean : 4304 Mean : 6602 Mean :11.59 Mean : 4.414
## 3rd Qu.: 5371 3rd Qu.: 8170 3rd Qu.:13.40 3rd Qu.: 5.200
## Max. :17687 Max. :22504 Max. :26.90 Max. :12.800
## AgeTotal.Per Female.Thou Male.Thou Total.Thou
## Min. : 2.400 Min. : 502 Min. : 824 Min. : 1480
## 1st Qu.: 4.400 1st Qu.: 1602 1st Qu.: 2470 1st Qu.: 4127
## Median : 5.500 Median : 3075 Median : 3562 Median : 6579
## Mean : 5.684 Mean : 2916 Mean : 3686 Mean : 6602
## 3rd Qu.: 6.700 3rd Qu.: 3687 3rd Qu.: 4512 3rd Qu.: 8170
## Max. :14.400 Max. :11494 Max. :11010 Max. :22504
## Female.Per Male.Per Total.Per Employment_level
## Min. : 2.600 Min. : 1.900 Min. : 2.400 Min. : 46056
## 1st Qu.: 4.800 1st Qu.: 4.200 1st Qu.: 4.400 1st Qu.: 59959
## Median : 5.700 Median : 5.200 Median : 5.500 Median : 89254
## Mean : 5.934 Mean : 5.565 Mean : 5.684 Mean : 90796
## 3rd Qu.: 6.900 3rd Qu.: 6.600 3rd Qu.: 6.700 3rd Qu.:121455
## Max. :15.700 Max. :13.300 Max. :14.400 Max. :142843
sum(is.na(us_unemployment))
## [1] 0
# 1. Filtering after 2001
us_unemployment_filtered <- us_unemployment %>%
filter(Month >= as.Date("2001-01-01"))
# 2. Plot
ggplot(us_unemployment_filtered, aes(x = Month)) +
geom_line(aes(y = AgeTotal.Thou, color = "Unemployment Level"), size = 1) +
geom_line(aes(y = Employment_level, color = "Employment Level"), size = 1) +
geom_line(aes(y = AgeTotal.Per * 7500, color = "Unemployment Rate [%]"), size = 1) +
scale_y_continuous(
name = "Employment / Unemployment [Thousands]",
limits = c(0, 150000),
sec.axis = sec_axis(~./7500, name = "Unemployment Rate [%]", breaks = seq(0, 20, 5))
) +
scale_color_manual(
name = "Legend",
values = c(
"Unemployment Level" = "darkblue",
"Employment Level" = "darkorange",
"Unemployment Rate [%]" = "gray"
)
) +
labs(
title = "US: Employment, Unemployment Count and Rate Since 2001",
x = "Month"
) +
theme_minimal()

# Check regularity of time index: tabulate day-gaps between
# consecutive months. All gaps are 28-31 days, so the index is
# regular. (The original comment claimed one missing month — that
# was copied from the Peru chunk; the anti_join below confirms none.)
diff_date <- diff(us_unemployment$Month)
table(diff_date) # No missing month: all gaps are one calendar month.
## diff_date
## 28 29 30 31
## 57 20 308 539
# Drop the time-of-day component so Month joins cleanly on Date
us_unemployment$Month <- as.Date(us_unemployment$Month)
# Create full monthly sequence spanning the observed range
full_month_seq <- data.frame(Month = seq.Date(from = min(us_unemployment$Month),to = max(us_unemployment$Month),
by = "month"))
# Join with original to find missing months (expect zero rows)
missing_months <- full_month_seq %>%
anti_join(us_unemployment, by = "Month")
print(missing_months) # no NAs
## [1] Month
## <0 rows> (or 0-length row.names)
# Check for outliers visually
boxplot(us_unemployment$AgeTotal.Thou,
main = "Boxplot: US AgeTotal.Thou",
horizontal = TRUE,
col = "lightblue")

# BUG FIX: both titles below were copy-pasted as "AgeTotal.Thou";
# they now name the variable actually plotted.
boxplot(us_unemployment$AgeTotal.Per,
        main = "Boxplot: US AgeTotal.Per",
        horizontal = TRUE,
        col = "lightblue")

boxplot(us_unemployment$Employment_level,
        main = "Boxplot: US Employment_level",
        horizontal = TRUE,
        col = "lightblue")

# Convert to monthly time series.
# BUG FIX: the original used start = c(2001, 4), copied from the Peru
# chunk, but these series are built from the UNFILTERED us_unemployment
# data, which begins in January 1948 (summary above: min Month
# 1948-01-01, 925 monthly rows through 2025-01). The wrong start
# mislabeled the entire time index of the decompositions below.
ts_us_total_thou <- ts(us_unemployment$AgeTotal.Thou,
                       start = c(1948, 1),
                       frequency = 12)
ts_us_total_per <- ts(us_unemployment$AgeTotal.Per,
                      start = c(1948, 1),
                      frequency = 12)
ts_us_employment_thou <- ts(us_unemployment$Employment_level,
                            start = c(1948, 1),
                            frequency = 12)
# Decompose the time series
decomp_ts_us_total_thou <- decompose(ts_us_total_thou)
plot(decomp_ts_us_total_thou)

decomp_ts_us_total_per <- decompose(ts_us_total_per)
plot(decomp_ts_us_total_per)

decomp_ts_us_employment_thou <- decompose(ts_us_employment_thou)
plot(decomp_ts_us_employment_thou)

# ACF and PACF plots for the three US series.
# BUG FIX: the original reused the title "us AgeTotal.Thou" for all
# three pairs; each pair now names the series plotted.
par(mfrow = c(1, 2))
Acf(ts_us_total_thou, lag.max = 40, main = "ACF: US AgeTotal.Thou")
Pacf(ts_us_total_thou, lag.max = 40, main = "PACF: US AgeTotal.Thou")

par(mfrow = c(1, 2))
Acf(ts_us_total_per, lag.max = 40, main = "ACF: US AgeTotal.Per")
Pacf(ts_us_total_per, lag.max = 40, main = "PACF: US AgeTotal.Per")

par(mfrow = c(1, 2))
Acf(ts_us_employment_thou, lag.max = 40, main = "ACF: US Employment_level")
Pacf(ts_us_employment_thou, lag.max = 40, main = "PACF: US Employment_level")

# Grubbs test for outliers
grubbs.test(ts_us_total_thou)
##
## Grubbs test for one outlier
##
## data: ts_us_total_thou
## G = 5.18747, U = 0.97085, p-value = 8.078e-05
## alternative hypothesis: highest value 22504.1 is an outlier
grubbs.test(ts_us_total_per)
##
## Grubbs test for one outlier
##
## data: ts_us_total_per
## G = 4.98129, U = 0.97312, p-value = 0.0002467
## alternative hypothesis: highest value 14.4 is an outlier
grubbs.test(ts_us_employment_thou)
##
## Grubbs test for one outlier
##
## data: ts_us_employment_thou
## G = 1.68866, U = 0.99691, p-value = 1
## alternative hypothesis: highest value 142843.355 is an outlier
Additional graphs(Jisup)
Comparing Peru and US(Population and GDP growth)
gdp_growth <- read_excel(
path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "US_GDP", col_names = TRUE)
gdp_growth <- gdp_growth[244:nrow(gdp_growth), ]
pop_growth <- read_excel(
path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "US_Population", col_names = TRUE)
pop_growth <- pop_growth[80:nrow(pop_growth), ]
# Check for missing values
summary(gdp_growth)
## observation_date
## Min. :2007-10-01 00:00:00.00
## 1st Qu.:2012-01-01 00:00:00.00
## Median :2016-04-01 00:00:00.00
## Mean :2016-03-31 21:54:46.96
## 3rd Qu.:2020-07-01 00:00:00.00
## Max. :2024-10-01 00:00:00.00
##
## Real GDP\r\nBillions of Chained 2017 Dollars,\r\nSeasonally Adjusted Annual Rate
## Min. :16269
## 1st Qu.:17367
## Median :19057
## Mean :19302
## 3rd Qu.:20843
## Max. :23542
##
## US GDP growth Real GDP per Capita GDP per Capita Peru GDP growth
## Min. :-28.100 Min. :53017 Min. :46865 Min. :-29.200
## 1st Qu.: 1.300 1st Qu.:55490 1st Qu.:51205 1st Qu.: 1.975
## Median : 2.500 Median :58704 Median :57705 Median : 3.550
## Mean : 2.148 Mean :59629 Mean :60888 Mean : 3.837
## 3rd Qu.: 3.500 3rd Qu.:63022 3rd Qu.:66222 3rd Qu.: 5.950
## Max. : 35.200 Max. :69006 Max. :87125 Max. : 41.100
## NA's :1
sum(is.na(gdp_growth))
## [1] 1
summary(pop_growth)
## observation_date US Population US_pop_growth
## Min. :2008-01-01 00:00:00.00 Min. :30454300 Min. :0.002004
## 1st Qu.:2012-01-01 00:00:00.00 1st Qu.:31472500 1st Qu.:0.005917
## Median :2016-01-01 00:00:00.00 Median :32460900 Median :0.007781
## Mean :2016-01-01 08:28:14.11 Mean :32319418 Mean :0.007094
## 3rd Qu.:2020-01-01 00:00:00.00 3rd Qu.:33184000 3rd Qu.:0.008281
## Max. :2024-01-01 00:00:00.00 Max. :34021200 Max. :0.009437
##
## US Labor Participation rate(25+) US Employment-to-population ratio
## Min. :62.68 Min. :58.61
## 1st Qu.:63.73 1st Qu.:60.95
## Median :64.19 Median :61.46
## Mean :64.62 Mean :61.37
## 3rd Qu.:65.37 3rd Qu.:61.80
## Max. :67.37 Max. :64.27
##
## peru_pop_growth ...7 ...8 ...9
## Min. :-0.061086 Min. : NA Mode:logical Min. : NA
## 1st Qu.:-0.003392 1st Qu.: NA NA's:17 1st Qu.: NA
## Median : 0.003044 Median : NA Median : NA
## Mean :-0.002813 Mean :NaN Mean :NaN
## 3rd Qu.: 0.004503 3rd Qu.: NA 3rd Qu.: NA
## Max. : 0.028980 Max. : NA Max. : NA
## NA's :17 NA's :17
sum(is.na(pop_growth))
## [1] 51
# 2. Plot
ggplot(gdp_growth, aes(x = observation_date)) +
geom_line(aes(y = `Peru GDP growth`, color = "Peru GDP Growth"), size = 1) +
geom_line(aes(y = `US GDP growth`, color = "US GDP Growth"), size = 1) +
scale_color_manual(
name = "Legend",
values = c(
"Peru GDP Growth" = "darkblue",
"US GDP Growth" = "darkorange"
)
) +
labs(
title = "US & Peru: Quarterly GDP Growth Since 2008",
x = "Date",
y = "GDP Growth (%)"
) +
theme_minimal()

ggplot(pop_growth, aes(x = observation_date)) +
geom_line(aes(y = peru_pop_growth, color = "Peru Population Growth"), size = 1) +
geom_line(aes(y = US_pop_growth, color = "US Population Growth"), size = 1) +
scale_color_manual(
name = "Legend",
values = c(
"Peru Population Growth" = "darkblue",
"US Population Growth" = "darkorange"
)
) +
labs(
title = "US & Peru: Quarterly Population Growth Since 2008",
x = "Date",
y = "Population Growth (%)"
) +
theme_minimal()

Missing month handling
## PERU
# Check regularity of time index: day-gaps between consecutive
# months; a 61-day gap indicates one skipped month.
diff_date <- diff(Peru$Month)
table(diff_date) # There's one missing month.
## diff_date
## 28 29 30 31 61
## 17 6 93 163 1
# Create full monthly sequence spanning the observed range
full_month_seq <- data.frame(Month = seq.Date(from = min(Peru$Month),
to = max(Peru$Month),
by = "month"))
# Join with original to find missing months
missing_months <- full_month_seq %>%
anti_join(Peru, by = "Month")
print(missing_months) # Missing month is 2012-09-01
## Month
## 1 2012-09-01
# Fill in missing months with NA and interpolate linearly.
# na.approx with na.rm = FALSE leaves any leading/trailing NAs in
# place (only interior gaps, like 2012-09, are interpolated).
Peru <- full_month_seq %>%
left_join(Peru, by = "Month") %>%
mutate(across(where(is.numeric), ~na.approx(.x, na.rm = FALSE)))
## US
diff_date <- diff(US$Month)
table(diff_date) # There's no missing month.
## diff_date
## 28 29 30 31
## 18 6 96 167
Summary Statistics
## PERU
# Generate Summary Statistics
summary_table <- Peru %>%
select(-Month) %>% # Exclude Month column
summarise(across(where(is.numeric),
list(Mean = ~ mean(.x),
SD = ~ sd(.x),
Min = ~ min(.x),
Max = ~ max(.x),
N = ~ sum(!is.na(.x))))) %>%
pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_") %>%
gt() %>%
tab_header(title = "Summary Statistics of Unemployment Data in Peru",
subtitle = "Monthly Data (2001-2024)") %>%
fmt_number(columns = 2:6, decimals = 2) %>%
cols_label(Variable = "Indicator", Mean = "Mean", SD = "Standard Deviation",
Min = "Min", Max = "Max", N = "Observations") %>%
tab_options(table.font.size = px(14),
heading.title.font.size = px(18), heading.subtitle.font.size = px(14))
# print(summary_table) # During knitting, this should not be excuted.
summary_table <- Peru %>%
select(-Month) %>%
summarise(across(where(is.numeric),
list(Mean = ~ mean(.x),
SD = ~ sd(.x),
Min = ~ min(.x),
Max = ~ max(.x),
N = ~ sum(!is.na(.x))))) %>%
pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_")
knitr::kable(summary_table, digits = 2, caption = "Summary Statistics of Unemployment Data in Peru")
Summary Statistics of Unemployment Data in Peru
| Age15to24.Per |
13.15 |
2.79 |
4.9 |
18.8 |
282 |
| Age25above.Per |
5.24 |
1.40 |
2.6 |
8.3 |
282 |
| Female.Per |
8.27 |
1.86 |
4.0 |
12.8 |
282 |
| Male.Per |
6.02 |
1.48 |
2.8 |
10.0 |
282 |
| Total.Per |
7.04 |
1.56 |
3.4 |
10.8 |
282 |
| Age15to24.Thou |
161.82 |
54.35 |
39.3 |
379.0 |
282 |
| Age25above.Thou |
238.57 |
150.11 |
119.7 |
880.9 |
282 |
| Female.Thou |
214.38 |
107.78 |
65.9 |
665.5 |
282 |
| Male.Thou |
186.00 |
89.20 |
89.9 |
586.0 |
282 |
| Total.Thou |
400.38 |
194.76 |
191.0 |
1229.5 |
282 |
# Check outliers
outlier(Peru)
## Month Age15to24.Per Age25above.Per Female.Per Male.Per
## 19967.0 4.9 8.3 12.8 10.0
## Total.Per Age15to24.Thou Age25above.Thou Female.Thou Male.Thou
## 10.8 379.0 880.9 665.5 586.0
## Total.Thou
## 1229.5
grubbs.test(Peru$Age15to24.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: Peru$Age15to24.Thou
## G = 3.99614, U = 0.94297, p-value = 0.007181
## alternative hypothesis: highest value 379 is an outlier
grubbs.test(Peru$Age25above.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: Peru$Age25above.Thou
## G = 4.2792, U = 0.9346, p-value = 0.001938
## alternative hypothesis: highest value 880.9 is an outlier
grubbs.test(Peru$Age15to24.Per)
##
## Grubbs test for one outlier
##
## data: Peru$Age15to24.Per
## G = 2.96292, U = 0.96865, p-value = 0.4013
## alternative hypothesis: lowest value 4.9 is an outlier
grubbs.test(Peru$Age25above.Per)
##
## Grubbs test for one outlier
##
## data: Peru$Age25above.Per
## G = 2.17958, U = 0.98303, p-value = 1
## alternative hypothesis: highest value 8.3 is an outlier
grubbs.test(Peru$Female.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: Peru$Female.Thou
## G = 4.18539, U = 0.93744, p-value = 0.003023
## alternative hypothesis: highest value 665.5 is an outlier
grubbs.test(Peru$Male.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: Peru$Male.Thou
## G = 4.48425, U = 0.92818, p-value = 0.0007077
## alternative hypothesis: highest value 586 is an outlier
grubbs.test(Peru$Female.Per)
##
## Grubbs test for one outlier
##
## data: Peru$Female.Per
## G = 2.43372, U = 0.97885, p-value = 1
## alternative hypothesis: highest value 12.8 is an outlier
grubbs.test(Peru$Male.Per)
##
## Grubbs test for one outlier
##
## data: Peru$Male.Per
## G = 2.69350, U = 0.97409, p-value = 0.9522
## alternative hypothesis: highest value 10 is an outlier
grubbs.test(Peru$Total.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: Peru$Total.Thou
## G = 4.25704, U = 0.93528, p-value = 0.002155
## alternative hypothesis: highest value 1229.5 is an outlier
grubbs.test(Peru$Total.Per)
##
## Grubbs test for one outlier
##
## data: Peru$Total.Per
## G = 2.40929, U = 0.97927, p-value = 1
## alternative hypothesis: highest value 10.8 is an outlier
# Visualize the distribution of total unemployment with a box plot
boxplot(Peru$Total.Per,
        col = "lightblue",
        horizontal = TRUE,
        main = "Boxplot: Peru Unemployment Rate (%)")

# Locate the observation(s) at the maximum rate (the 10.8% outlier)
peak_rate <- max(Peru$Total.Per, na.rm = TRUE)
outlier_row <- Peru %>% filter(Total.Per == peak_rate)
print(outlier_row) # Highest Value is 2005-02-01.
## Month Age15to24.Per Age25above.Per Female.Per Male.Per Total.Per
## 1 2005-02-01 17.9 8.3 12.8 9.2 10.8
## Age15to24.Thou Age25above.Thou Female.Thou Male.Thou Total.Thou
## 1 194.2 257.1 239.8 211.5 451.3
## PERU
# Generate Summary Statistics for the US series.
# The stats are computed ONCE and rendered two ways (gt for interactive use,
# kable for the knitted report) instead of duplicating the summarise pipeline.
# na.rm = TRUE keeps Mean/SD/Min/Max consistent with N, which already counts
# only non-missing observations (printed output shows N = 288 throughout, so
# results are unchanged when no NAs are present).
US_summary_stats <- US %>%
  select(-Month) %>% # Exclude Month column
  summarise(across(where(is.numeric),
                   list(Mean = ~ mean(.x, na.rm = TRUE),
                        SD = ~ sd(.x, na.rm = TRUE),
                        Min = ~ min(.x, na.rm = TRUE),
                        Max = ~ max(.x, na.rm = TRUE),
                        N = ~ sum(!is.na(.x))))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_")

# Formatted gt table (interactive viewing only)
summary_tableUS <- US_summary_stats %>%
  gt() %>%
  tab_header(title = "Summary Statistics of Unemployment Data in US",
             subtitle = "Monthly Data (2001-2024)") %>%
  fmt_number(columns = 2:6, decimals = 2) %>%
  cols_label(Variable = "Indicator", Mean = "Mean", SD = "Standard Deviation",
             Min = "Min", Max = "Max", N = "Observations") %>%
  tab_options(table.font.size = px(14),
              heading.title.font.size = px(18), heading.subtitle.font.size = px(14))
# print(summary_tableUS) # During knitting, this should not be executed.

# Plain kable version for the knitted report
summary_tableUS_simple <- US_summary_stats
knitr::kable(summary_tableUS_simple, digits = 2, caption = "Summary Statistics of Unemployment Data in US")
Summary Statistics of Unemployment Data in US
| Age15to24.Per |
12.01 |
3.49 |
5.7 |
26.9 |
288 |
| Age25above.Per |
4.76 |
1.77 |
2.6 |
12.8 |
288 |
| Female.Per |
5.56 |
1.88 |
2.9 |
15.7 |
288 |
| Male.Per |
5.95 |
2.15 |
3.2 |
13.3 |
288 |
| Total.Per |
5.77 |
1.98 |
3.1 |
14.4 |
288 |
| Age15to24.Thou |
2584.03 |
739.48 |
1232.8 |
4869.8 |
288 |
| Age25above.Thou |
6368.66 |
2360.28 |
3711.0 |
17686.9 |
288 |
| Female.Thou |
4034.55 |
1344.29 |
2243.2 |
11494.3 |
288 |
| Male.Thou |
4918.15 |
1735.00 |
2842.0 |
11009.8 |
288 |
| Total.Thou |
8952.70 |
3016.38 |
5146.1 |
22504.1 |
288 |
# Check outliers
outlier(US)
## Month Age15to24.Per Age25above.Per Female.Per Male.Per
## 20058.0 26.9 12.8 15.7 13.3
## Total.Per Age15to24.Thou Age25above.Thou Female.Thou Male.Thou
## 14.4 4869.8 17686.9 11494.3 11009.8
## Total.Thou
## 22504.1
grubbs.test(US$Age15to24.Thou)
##
## Grubbs test for one outlier
##
## data: US$Age15to24.Thou
## G = 3.09104, U = 0.96659, p-value = 0.2652
## alternative hypothesis: highest value 4869.8 is an outlier
grubbs.test(US$Age25above.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: US$Age25above.Thou
## G = 4.7953, U = 0.9196, p-value = 0.0001438
## alternative hypothesis: highest value 17686.9 is an outlier
grubbs.test(US$Age15to24.Per) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: US$Age15to24.Per
## G = 4.26881, U = 0.93628, p-value = 0.002095
## alternative hypothesis: highest value 26.9 is an outlier
grubbs.test(US$Age25above.Per) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: US$Age25above.Per
## G = 4.52998, U = 0.92825, p-value = 0.0005784
## alternative hypothesis: highest value 12.8 is an outlier
grubbs.test(US$Female.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: US$Female.Thou
## G = 5.54923, U = 0.89233, p-value = 1.695e-06
## alternative hypothesis: highest value 11494.3 is an outlier
grubbs.test(US$Male.Thou)
##
## Grubbs test for one outlier
##
## data: US$Male.Thou
## G = 3.5110, U = 0.9569, p-value = 0.05618
## alternative hypothesis: highest value 11009.8 is an outlier
grubbs.test(US$Female.Per) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: US$Female.Per
## G = 5.40402, U = 0.89789, p-value = 4.225e-06
## alternative hypothesis: highest value 15.7 is an outlier
grubbs.test(US$Male.Per)
##
## Grubbs test for one outlier
##
## data: US$Male.Per
## G = 3.42210, U = 0.95905, p-value = 0.07927
## alternative hypothesis: highest value 13.3 is an outlier
grubbs.test(US$Total.Thou) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: US$Total.Thou
## G = 4.49260, U = 0.92943, p-value = 0.0006988
## alternative hypothesis: highest value 22504.1 is an outlier
grubbs.test(US$Total.Per) # This is an outlier.
##
## Grubbs test for one outlier
##
## data: US$Total.Per
## G = 4.3545, U = 0.9337, p-value = 0.001386
## alternative hypothesis: highest value 14.4 is an outlier
# Visualize the distribution of total unemployment with a box plot
boxplot(US$Total.Per,
        col = "lightblue",
        horizontal = TRUE,
        main = "Boxplot: US Unemployment Rate (%)")

# Locate the observation(s) at the maximum rate (the 14.4% outlier)
peak_rateUS <- max(US$Total.Per, na.rm = TRUE)
outlier_rowUS <- US %>% filter(Total.Per == peak_rateUS)
print(outlier_rowUS) # Outlier is 2020-04-01.
## # A tibble: 1 × 11
## Month Age15to24.Per Age25above.Per Female.Per Male.Per Total.Per
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-01 26.9 12.8 15.7 13.3 14.4
## # ℹ 5 more variables: Age15to24.Thou <dbl>, Age25above.Thou <dbl>,
## # Female.Thou <dbl>, Male.Thou <dbl>, Total.Thou <dbl>
Time Series Analysis for Peru
Step 2: Decompose the time series for Peru
# Decompose the Peru Total.Per training series into trend/seasonal/remainder
# (classical decomposition via decompose(); additive by default)
decom_totalper_train <- decompose(ts_Peru_train[,"Total.Per"])
plot(decom_totalper_train)

# Deseason: remove the seasonal component so the trend/unit-root tests below
# are not confounded by within-year cycles
deseas_totalper_train <- seasadj(decom_totalper_train)
plot(deseas_totalper_train)

# Run the tests on deseasoned series
print(adf.test(deseas_totalper_train, alternative = "stationary")) # p = 0.17 > 0.05: fail to reject H0, so the series has a unit root.
##
## Augmented Dickey-Fuller Test
##
## data: deseas_totalper_train
## Dickey-Fuller = -2.9546, Lag order = 6, p-value = 0.1739
## alternative hypothesis: stationary
summary(MannKendall(deseas_totalper_train)) # It has a decreasing trend.
## Score = -24109 , Var(Score) = 2199023
## denominator = 36289.99
## tau = -0.664, 2-sided pvalue =< 2.22e-16
# Run the tests on original series
print(adf.test(ts_Peru_train[,"Total.Per"], alternative = "stationary")) # It is stationary.
##
## Augmented Dickey-Fuller Test
##
## data: ts_Peru_train[, "Total.Per"]
## Dickey-Fuller = -4.0545, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
summary(SeasonalMannKendall(ts_Peru_train[,"Total.Per"]))
## Score = -1992 , Var(Score) = 16095.33
## denominator = 2878.875
## tau = -0.692, 2-sided pvalue =< 2.22e-16
summary(smk.test(ts_Peru_train[,"Total.Per"])) # It has seasonality.
##
## Seasonal Mann-Kendall trend test (Hirsch-Slack test)
##
## data: ts_Peru_train[, "Total.Per"]
## alternative hypothesis: two.sided
##
## Statistics for individual seasons
##
## H0
## S varS tau z Pr(>|z|)
## Season 1: S = 0 -154 1249.3 -0.677 -4.329 1.5003e-05 ***
## Season 2: S = 0 -146 1254.7 -0.636 -4.094 4.2475e-05 ***
## Season 3: S = 0 -147 1251.7 -0.645 -4.127 3.6792e-05 ***
## Season 4: S = 0 -171 1427.0 -0.684 -4.500 6.7873e-06 ***
## Season 5: S = 0 -170 1430.7 -0.676 -4.468 7.8938e-06 ***
## Season 6: S = 0 -167 1427.0 -0.668 -4.394 1.1110e-05 ***
## Season 7: S = 0 -182 1430.7 -0.724 -4.785 1.7073e-06 ***
## Season 8: S = 0 -189 1431.7 -0.750 -4.969 6.7427e-07 ***
## Season 9: S = 0 -173 1429.7 -0.689 -4.549 5.3915e-06 ***
## Season 10: S = 0 -158 1254.7 -0.688 -4.432 9.3205e-06 ***
## Season 11: S = 0 -160 1254.7 -0.697 -4.489 7.1616e-06 ***
## Season 12: S = 0 -175 1253.7 -0.764 -4.914 8.9118e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for any differencing needed
print(ndiffs(ts_Peru_train[,"Total.Per"]))
## [1] 1
print(ndiffs(deseas_totalper_train))
## [1] 1
Step 3: Test Time Series Models for Peru
# Seasonal Naive Model: each forecast repeats the value from the same month
# one year earlier.
# NOTE(review): despite the "_deseas_" in the name, this is fit on the
# ORIGINAL seasonal series (snaive needs the seasonality) — consider renaming.
SNAIVE_deseas_totalper <- snaive(ts_Peru_train[,"Total.Per"], h=n_for)
autoplot(SNAIVE_deseas_totalper)

checkresiduals(SNAIVE_deseas_totalper) # Residuals are not iid (Ljung-Box p < 2.2e-16, see output below).

##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 560.18, df = 24, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 24
# Simple Moving Average Model
SMA_deseas_totalper <- smooth::sma(y = deseas_totalper_train, h=n_for,
holdout = FALSE, silent = FALSE)
## Order 1 - 263.7536; Order 100 - 795.3006; Order 200 - 936.0019
## Order 1 - 263.7536; Order 50 - 686.2083; Order 100 - 795.3006
## Order 1 - 263.7536; Order 25 - 577.3812; Order 50 - 686.2083
## Order 1 - 263.7536; Order 13 - 477.0378; Order 25 - 577.3812
## Order 1 - 263.7536; Order 7 - 392.1558; Order 13 - 477.0378
## Order 1 - 263.7536; Order 4 - 350.4972; Order 7 - 392.1558
## Order 1 - 263.7536; Order 2 - 287.2054; Order 4 - 350.4972
## Order 12 - 465.9452

summary(SMA_deseas_totalper)
##
## Model estimated using sma() function: SMA(1)
## Response variable: y
## Distribution used in the estimation: Normal
## Loss function type: MSE; Loss function value: 0.1544
## All coefficients were provided
## Error standard deviation: 0.3936
## Sample size: 270
## Number of estimated parameters: 1
## Number of degrees of freedom: 269
## Information criteria:
## AIC AICc BIC BICc
## 263.7387 263.7536 267.3371 267.3789
checkresiduals(SMA_deseas_totalper) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals
## Q* = 50.942, df = 24, p-value = 0.001073
##
## Model df: 0. Total lags used: 24
# Simple Exponential Smoothing Model
SES_deseas_totalper = ses(y = deseas_totalper_train, h=n_for,
holdout = FALSE, silent = FALSE)
summary(SES_deseas_totalper)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = deseas_totalper_train, h = n_for, holdout = FALSE, silent = FALSE)
##
## Smoothing parameters:
## alpha = 0.8211
##
## Initial states:
## l = 8.589
##
## sigma: 0.3891
##
## AIC AICc BIC
## 1005.810 1005.901 1016.606
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01630259 0.3876277 0.2960287 -0.4932815 4.548733 0.4949799
## ACF1
## Training set 0.0107995
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Oct 2023 4.974593 4.475978 5.473208 4.212027 5.737159
## Nov 2023 4.974593 4.329420 5.619767 3.987885 5.961301
## Dec 2023 4.974593 4.210472 5.738715 3.805971 6.143216
## Jan 2024 4.974593 4.107694 5.841492 3.648786 6.300401
## Feb 2024 4.974593 4.015872 5.933314 3.508356 6.440830
## Mar 2024 4.974593 3.932107 6.017080 3.380248 6.568939
## Apr 2024 4.974593 3.854589 6.094598 3.261694 6.687492
## May 2024 4.974593 3.782099 6.167087 3.150831 6.798356
## Jun 2024 4.974593 3.713770 6.235416 3.046331 6.902856
## Jul 2024 4.974593 3.648959 6.300228 2.947210 7.001976
## Aug 2024 4.974593 3.587172 6.362015 2.852715 7.096472
## Sep 2024 4.974593 3.528021 6.421165 2.762252 7.186934
autoplot(SES_deseas_totalper)

checkresiduals(SES_deseas_totalper) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from Simple exponential smoothing
## Q* = 45.154, df = 24, p-value = 0.005586
##
## Model df: 0. Total lags used: 24
# SARIMA Model
SARIMA_totalper <- auto.arima(ts_Peru_train[,"Total.Per"])
print(SARIMA_totalper)
## Series: ts_Peru_train[, "Total.Per"]
## ARIMA(3,0,4)(0,1,2)[12] with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 ma4 sma1
## 2.2233 -1.8081 0.5245 -1.4075 0.7380 -0.2152 0.2613 -0.6534
## s.e. 0.1584 0.2932 0.1466 0.1642 0.1937 0.1047 0.0837 0.0745
## sma2 drift
## -0.1570 -0.0160
## s.e. 0.0754 0.0031
##
## sigma^2 = 0.1511: log likelihood = -125.09
## AIC=272.18 AICc=273.25 BIC=311.26
SARIMA_forecast_totalper <- forecast(object = SARIMA_totalper, h=n_for)
autoplot(SARIMA_forecast_totalper)

checkresiduals(SARIMA_forecast_totalper) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,4)(0,1,2)[12] with drift
## Q* = 16.411, df = 15, p-value = 0.3553
##
## Model df: 9. Total lags used: 24
# Deseasonalized ARIMA Model: fit on the seasonally adjusted series;
# max.D/max.P/max.Q = 0 disables all seasonal ARIMA terms so auto.arima
# searches non-seasonal orders only.
ARIMA_totalper <- auto.arima(deseas_totalper_train, max.D = 0,
max.P = 0, max.Q = 0)
print(ARIMA_totalper)
## Series: deseas_totalper_train
## ARIMA(4,1,0)
##
## Coefficients:
## ar1 ar2 ar3 ar4
## -0.1889 -0.0478 -0.2786 -0.1130
## s.e. 0.0608 0.0594 0.0595 0.0613
##
## sigma^2 = 0.1411: log likelihood = -116.43
## AIC=242.87 AICc=243.1 BIC=260.84
ARIMA_forecast_totalper <- forecast(object = ARIMA_totalper, h=n_for)
autoplot(ARIMA_forecast_totalper)

checkresiduals(ARIMA_forecast_totalper) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,0)
## Q* = 18.559, df = 20, p-value = 0.5506
##
## Model df: 4. Total lags used: 24
# STL + ETS Model
ETS_totalper <- stlf(ts_Peru_train[,"Total.Per"],h=n_for)
autoplot(ETS_totalper)

checkresiduals(ETS_totalper) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 52.383, df = 24, p-value = 0.0006971
##
## Model df: 0. Total lags used: 24
# ARIMA + FOURIER Model
ARIMA_Four_fit_totalper <- auto.arima(ts_Peru_train[,"Total.Per"],
seasonal=FALSE, lambda=0,
xreg=fourier(ts_Peru_train[,"Total.Per"],
K=3))
ARIMA_Four_for_totalper <- forecast(ARIMA_Four_fit_totalper,
xreg=fourier(ts_Peru_train[,"Total.Per"],
K=3, h=n_for),
h=n_for)
autoplot(ARIMA_Four_for_totalper)

checkresiduals(ARIMA_Four_for_totalper) # Better fit

##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,1,2) errors
## Q* = 18.424, df = 19, p-value = 0.4943
##
## Model df: 5. Total lags used: 24
# TBATS Model
TBATS_fit_totalper <- tbats(ts_Peru_train[,"Total.Per"])
TBATS_for_totalper <- forecast(TBATS_fit_totalper, h = n_for)
autoplot(TBATS_for_totalper)

checkresiduals(TBATS_fit_totalper) # Better fit

##
## Ljung-Box test
##
## data: Residuals from TBATS
## Q* = 21.678, df = 24, p-value = 0.5985
##
## Model df: 0. Total lags used: 24
# Neural Network Model
NN_fit_totalper <- nnetar(ts_Peru_train[,"Total.Per"],
p=3, P=0,
xreg=fourier(ts_Peru_train[,"Total.Per"], K=3))
NN_for_totalper <- forecast(NN_fit_totalper,
h=n_for,
xreg=fourier(ts_Peru_train[,"Total.Per"],
K=3,h=n_for))
autoplot(NN_for_totalper)

checkresiduals(NN_fit_totalper) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from NNAR(3,5)
## Q* = 46.055, df = 24, p-value = 0.00436
##
## Model df: 0. Total lags used: 24
## State Space Exponential Smoothing Model
SSES_seas_totalper <- es(ts_Peru_train[,"Total.Per"],
model="ZZZ", h=n_for, holdout=FALSE)
plot(SSES_seas_totalper$fitted, type = "l", col = "blue", main = "Fitted Values from SSES Model")
lines(SSES_seas_totalper$forecast, col = "red")

checkresiduals(SSES_seas_totalper) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals
## Q* = 45.384, df = 24, p-value = 0.005245
##
## Model df: 0. Total lags used: 24
## State Space with BSM Model
SS_seas_totalper <- StructTS(ts_Peru_train[,"Total.Per"],
type="BSM",fixed=c(0.3,0.01,0.1,NA))
SS_for_totalper <- forecast(SS_seas_totalper,h=n_for)
plot(SS_for_totalper)

checkresiduals(SS_seas_totalper) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from StructTS
## Q* = 122.55, df = 24, p-value = 3.442e-15
##
## Model df: 0. Total lags used: 24
Step 4: Performance check for Peru
# Check accuracy of each candidate model on the held-out Peru test window.
# accuracy(forecast_mean, actual) returns ME, RMSE, MAE, MPE, MAPE, ACF1 and
# Theil's U for the test set. (Fixes the "SANIVE" typo in the score name.)
SNAIVE_tpscores <- accuracy(SNAIVE_deseas_totalper$mean, ts_Peru_test[,"Total.Per"])
SMA_tpscores <- accuracy(SMA_deseas_totalper$forecast, ts_Peru_test[,"Total.Per"])
SES_tpscores <- accuracy(SES_deseas_totalper$mean, ts_Peru_test[,"Total.Per"])
SARIMA_tpscores <- accuracy(SARIMA_forecast_totalper$mean, ts_Peru_test[,"Total.Per"])
ARIMA_tpscores <- accuracy(ARIMA_forecast_totalper$mean, ts_Peru_test[,"Total.Per"])
ETS_tpscores <- accuracy(ETS_totalper$mean, ts_Peru_test[,"Total.Per"])
ARIMA_Four_tpscores <- accuracy(ARIMA_Four_for_totalper$mean, ts_Peru_test[,"Total.Per"])
TBATS_tpscores <- accuracy(TBATS_for_totalper$mean, ts_Peru_test[,"Total.Per"])
NN_tpscores <- accuracy(NN_for_totalper$mean, ts_Peru_test[,"Total.Per"])
SSES_tpscores <- accuracy(SSES_seas_totalper$forecast, ts_Peru_test[,"Total.Per"])
SS_tpscores <- accuracy(SS_for_totalper$mean, ts_Peru_test[,"Total.Per"])

# Stack the score rows into one comparison table.
# NOTE(review): "Average" mixes metrics with different scales and signs
# (ME/MPE can be negative, Theil's U is unitless) — consider ranking on a
# single scale-free metric such as MAPE or RMSE instead.
tpscores <- as.data.frame(rbind(SNAIVE_tpscores, SMA_tpscores,
                                SES_tpscores, SARIMA_tpscores, ARIMA_tpscores,
                                ETS_tpscores, ARIMA_Four_tpscores, TBATS_tpscores,
                                NN_tpscores, SSES_tpscores, SS_tpscores)) %>%
  mutate(Average = rowMeans(., na.rm = TRUE))
row.names(tpscores) <- c("SNAIVE", "SMA", "SES", "SARIMA", "ARIMA",
                         "ETS", "ARIMA_FOURIER", "TBATS", "NNETAR",
                         "SSES", "BSM")

# Choose model with lowest average error
best_model_index_tp <- which.min(tpscores[,"Average"])
cat("The best model by Average is:", row.names(tpscores)[best_model_index_tp])
## The best model by Average is: SSES
# Render the accuracy comparison table; the row with the lowest Average
# score is striped for emphasis.
kbl(tpscores,
    caption = "Forecast Accuracy for Unemployment Rate (%) Data",
    digits = rep(5, ncol(tpscores))) %>%
  kable_styling(position = "center", full_width = FALSE) %>%
  # highlight the model with the lowest average error
  kable_styling(latex_options = "striped",
                stripe_index = which.min(tpscores[, "Average"]))
Forecast Accuracy for Unemployment Rate (%) Data
|
|
ME
|
RMSE
|
MAE
|
MPE
|
MAPE
|
ACF1
|
Theil’s U
|
Average
|
|
SNAIVE
|
0.64167
|
0.76974
|
0.69167
|
12.11799
|
13.16863
|
-0.01233
|
0.84418
|
4.03165
|
|
SMA
|
0.34691
|
0.78067
|
0.58421
|
5.00053
|
10.19900
|
0.22688
|
0.91989
|
2.57973
|
|
SES
|
0.31707
|
0.76788
|
0.57923
|
4.42730
|
10.15997
|
0.22688
|
0.90374
|
2.48315
|
|
SARIMA
|
0.52882
|
0.64457
|
0.52882
|
9.57292
|
9.57292
|
-0.27866
|
0.77381
|
3.04903
|
|
ARIMA
|
0.40318
|
0.80325
|
0.59973
|
6.09311
|
10.41354
|
0.20220
|
0.94873
|
2.78053
|
|
ETS
|
0.36756
|
0.52866
|
0.39184
|
6.52143
|
7.06096
|
-0.35963
|
0.64251
|
2.16476
|
|
ARIMA_FOURIER
|
0.52353
|
0.71506
|
0.56808
|
9.06052
|
10.05054
|
-0.28342
|
0.86529
|
3.07137
|
|
TBATS
|
0.44151
|
0.64266
|
0.50814
|
7.70751
|
9.18814
|
-0.39919
|
0.77704
|
2.69512
|
|
NNETAR
|
0.40880
|
0.56763
|
0.43296
|
7.28131
|
7.81823
|
-0.20078
|
0.67849
|
2.42666
|
|
SSES
|
0.31979
|
0.55243
|
0.42212
|
5.37303
|
7.63010
|
-0.41969
|
0.66813
|
2.07799
|
|
BSM
|
0.34261
|
0.49569
|
0.40944
|
6.26408
|
7.72735
|
-0.20497
|
0.57272
|
2.22956
|
# Overlay every model's test-window forecast on the observed Peru series
# (layers are drawn in the order listed, so later series sit on top).
autoplot(ts_Peru_test[,"Total.Per"]) +
autolayer(SNAIVE_deseas_totalper, PI=FALSE, series="SNAIVE") +
autolayer(SES_deseas_totalper, PI=FALSE, series="SES") +
autolayer(SARIMA_forecast_totalper, PI=FALSE, series="SARIMA") +
autolayer(ARIMA_forecast_totalper, PI=FALSE, series="ARIMA") +
autolayer(ETS_totalper, PI=FALSE, series="ETS") +
autolayer(ARIMA_Four_for_totalper, PI=FALSE, series="ARIMA_FOURIER") +
autolayer(TBATS_for_totalper, PI=FALSE, series="TBATS") +
autolayer(NN_for_totalper, PI=FALSE, series="NNETAR") +
autolayer(SS_for_totalper, PI=FALSE, series="BSM") +
guides(colour=guide_legend(title="Forecast")) # SMA and SSES could not run — presumably their smooth-package objects lack an autolayer method; confirm.

Step 5: Forecast for 2025 with the best three models for Peru
## Refit the three best-performing models on the FULL Peru sample and
## forecast n_full months ahead.
n_full <- 15 # forecast horizon in months (was `=`; use `<-` for assignment)

# Create the time series retaining the full data set.
# Select Total.Per by NAME rather than position (was Peru[,6]) so the code
# survives any column reordering; the printed outlier row confirms column 6
# is Total.Per today.
ts_Peru_fulltrain <- ts(Peru[,"Total.Per"],
                        start=c(year(Peru$Month[1]), month(Peru$Month[1])),
                        frequency = 12)

# Fit SSES (state-space exponential smoothing, auto-selected "ZZZ" form)
SSES_seas_totalper_fulltrain <- es(ts_Peru_fulltrain,
                                   model="ZZZ", h=n_full, holdout=FALSE)
SSES_for_totalper_fulltrain <- forecast(SSES_seas_totalper_fulltrain, h=n_full)
# Plot model + observed data
autoplot(ts_Peru_fulltrain) +
  autolayer(SSES_for_totalper_fulltrain, series="SSES", PI=FALSE) +
  ylab("Forecasted Unemployment Rate (%) in Peru")

# Fit STL + ETS Model (stlf() already returns a forecast object; forecast()
# here simply regenerates it at the same horizon)
ETS_seas_totalper_fulltrain <- stlf(ts_Peru_fulltrain, h=n_full)
ETS_for_totalper_fulltrain <- forecast(ETS_seas_totalper_fulltrain, h=n_full)
# Plot model + observed data
autoplot(ts_Peru_fulltrain) +
  autolayer(ETS_for_totalper_fulltrain, series="ETS", PI=FALSE) +
  ylab("Forecasted Unemployment Rate (%) in Peru")

# Fit SS with BSM Model
# NOTE(review): the pinned variance components c(0.01, 0.001, 0.1, NA) differ
# from the training fit in Step 3 (c(0.3, 0.01, 0.1, NA)) — confirm which
# tuning is intended before reporting results.
SS_seas_totalper_fulltrain <- StructTS(ts_Peru_fulltrain,
                                       type="BSM", fixed=c(0.01,0.001,0.1,NA))
SS_for_totalper_fulltrain <- forecast(SS_seas_totalper_fulltrain, h=n_full)
# Plot model + observed data
autoplot(ts_Peru_fulltrain) +
  autolayer(SS_for_totalper_fulltrain, series="SS with BSM Model", PI=FALSE) +
  ylab("Forecasted Unemployment Rate (%) in Peru")

# Plot 3 models together
autoplot(ts_Peru_fulltrain) +
  autolayer(SSES_for_totalper_fulltrain, series="SSES", PI=FALSE) +
  autolayer(ETS_for_totalper_fulltrain, series="ETS", PI=FALSE) +
  autolayer(SS_for_totalper_fulltrain, series="SS with BSM Model", PI=FALSE) +
  ylab("Unemployment Rate (%)") +
  ggtitle("Forecasted Unemployment Rate (%) in Peru")

Step 6: The average of 3 forecasts
# --- 1. Extract predicted means from the three refitted models ---
bsm_fc <- as.numeric(SS_for_totalper_fulltrain$mean)
sses_fc <- as.numeric(SSES_for_totalper_fulltrain$mean)
ets_fc <- as.numeric(ETS_for_totalper_fulltrain$mean)
# Compute the simple (equal-weight) average forecast of the three models
avg_fc <- (bsm_fc + sses_fc + ets_fc) / 3
# --- 2. Create forecast date index ---
# One month after the last observed month (translated from Korean)
# NOTE(review): start date is hard-coded — confirm it equals
# max(Peru$Month) + 1 month before reuse with updated data.
start_month <- as.Date("2025-01-01") # or: the date just after the last entry in Peru$Month
forecast_dates <- seq(from = start_month, by = "month", length.out = n_full)
# --- 3. Create forecast dataframe (one column per model + the average) ---
forecast_df <- tibble(
Month = forecast_dates,
BSM = bsm_fc,
SSES = sses_fc,
ETS = ets_fc,
Avg_Forecast = avg_fc
)
# --- 4. Export to Excel ---
write.xlsx(forecast_df, "../Forecast_Average_Peru.xlsx")
# Helper: combine an actuals tibble (columns Month, Actual) with the averaged
# forecast (avg_fc over forecast_dates, defined above) into long format and
# plot both series on one chart. Extracted because the 2020- and 2022-start
# plots below were duplicated line-for-line.
plot_actuals_with_avg_forecast <- function(actuals, title) {
  plot_df <- bind_rows(
    tibble(Month = actuals$Month, Value = actuals$Actual, Type = "Actual"),
    tibble(Month = forecast_dates, Value = avg_fc, Type = "Avg Forecast")
  )
  ggplot(plot_df, aes(x = Month, y = Value, color = Type, group = 1)) +
    # NOTE(review): `size` for lines is deprecated in ggplot2 >= 3.4 in favor
    # of `linewidth`; kept as-is pending confirmation of the package version.
    geom_line(size = 1) +
    scale_color_manual(values = c("Actual" = "black", "Avg Forecast" = "steelblue")) +
    labs(
      title = title,
      y = "Unemployment Rate (%)",
      x = "Month",
      color = "Legend"
    ) +
    theme_minimal()
}

# --- 5./6. Actuals from 2020 onward + average forecast ---
peru_actual_2020on <- Peru %>%
  filter(Month >= as.Date("2020-01-01")) %>%
  select(Month, Actual = Total.Per)
plot_actuals_with_avg_forecast(
  peru_actual_2020on,
  "Peru Unemployment Rate: Actual (2020~) + Avg Forecast (2025)"
)

# --- 7./8. Actuals from 2022 onward + average forecast (zoomed-in view) ---
peru_actual_2022on <- Peru %>%
  filter(Month >= as.Date("2022-01-01")) %>%
  select(Month, Actual = Total.Per)
plot_actuals_with_avg_forecast(
  peru_actual_2022on,
  "Peru Unemployment Rate: Actual (2022~) + Avg Forecast (2025)"
)

Time Series Analysis for US (Original Series)
Step 2: Decompose the time series for US (Original)
# Decompose the US Total.Per training series into trend/seasonal/remainder
decom_totalper_trainUS <- decompose(ts_US_train[,"Total.Per"])
plot(decom_totalper_trainUS)

# Deseason: remove the seasonal component before trend/unit-root testing
deseas_totalper_trainUS <- seasadj(decom_totalper_trainUS)
plot(deseas_totalper_trainUS)

# Run the tests on deseasoned series
print(adf.test(deseas_totalper_trainUS, alternative = "stationary")) # p = 0.43 > 0.05: fail to reject H0 — NOT stationary (has a unit root); the original comment claiming stationarity contradicted the printed p-value.
##
## Augmented Dickey-Fuller Test
##
## data: deseas_totalper_trainUS
## Dickey-Fuller = -2.3519, Lag order = 6, p-value = 0.4278
## alternative hypothesis: stationary
summary(MannKendall(deseas_totalper_trainUS)) # It has a decreasing trend.
## Score = -10342 , Var(Score) = 2348637
## denominator = 37927.99
## tau = -0.273, 2-sided pvalue =1.5023e-11
# Run the tests on original series
print(adf.test(ts_US_train[,"Total.Per"], alternative = "stationary")) # p = 0.43 > 0.05: fail to reject H0 — NOT stationary (has a unit root); the original comment claiming stationarity contradicted the printed p-value.
##
## Augmented Dickey-Fuller Test
##
## data: ts_US_train[, "Total.Per"]
## Dickey-Fuller = -2.3553, Lag order = 6, p-value = 0.4264
## alternative hypothesis: stationary
summary(SeasonalMannKendall(ts_US_train[,"Total.Per"]))
## Score = -876 , Var(Score) = 17157.33
## denominator = 3013.905
## tau = -0.291, 2-sided pvalue =2.2665e-11
summary(smk.test(ts_US_train[,"Total.Per"])) # It has seasonality.
##
## Seasonal Mann-Kendall trend test (Hirsch-Slack test)
##
## data: ts_US_train[, "Total.Per"]
## alternative hypothesis: two.sided
##
## Statistics for individual seasons
##
## H0
## S varS tau z Pr(>|z|)
## Season 1: S = 0 -77 1429.7 -0.307 -2.010 0.0444311 *
## Season 2: S = 0 -81 1429.7 -0.323 -2.116 0.0343627 *
## Season 3: S = 0 -81 1429.0 -0.323 -2.116 0.0343207 *
## Season 4: S = 0 -55 1431.7 -0.218 -1.427 0.1535337
## Season 5: S = 0 -51 1427.0 -0.204 -1.324 0.1856346
## Season 6: S = 0 -59 1429.0 -0.235 -1.534 0.1249545
## Season 7: S = 0 -58 1430.7 -0.231 -1.507 0.1318174
## Season 8: S = 0 -61 1429.7 -0.243 -1.587 0.1125483
## Season 9: S = 0 -72 1430.7 -0.286 -1.877 0.0605034 .
## Season 10: S = 0 -81 1433.7 -0.320 -2.113 0.0346148 *
## Season 11: S = 0 -97 1429.7 -0.386 -2.539 0.0111186 *
## Season 12: S = 0 -103 1427.0 -0.412 -2.700 0.0069308 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for any differencing needed
print(ndiffs(ts_US_train[,"Total.Per"]))
## [1] 1
print(ndiffs(deseas_totalper_trainUS))
## [1] 1
Step 3: Test Time Series Models for US (Original)
# Seasonal Naive Model
SNAIVE_deseas_totalperUS <- snaive(ts_US_train[,"Total.Per"], h=n_forUS)
autoplot(SNAIVE_deseas_totalperUS)

checkresiduals(SNAIVE_deseas_totalperUS) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 691.48, df = 24, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 24
# Simple Moving Average Model
SMA_deseas_totalperUS <- smooth::sma(y = deseas_totalper_trainUS, h=n_forUS,
holdout = FALSE, silent = FALSE)
## Order 1 - 564.0677; Order 100 - 1176.5245; Order 200 - 1165.6369
## Order 1 - 564.0677; Order 50 - 1116.0115; Order 100 - 1176.5245
## Order 1 - 564.0677; Order 25 - 1011.42; Order 50 - 1116.0115
## Order 1 - 564.0677; Order 13 - 890.01; Order 25 - 1011.42
## Order 1 - 564.0677; Order 7 - 797.6525; Order 13 - 890.01
## Order 1 - 564.0677; Order 4 - 721.5871; Order 7 - 797.6525
## Order 1 - 564.0677; Order 2 - 631.7024; Order 4 - 721.5871
## Order 12 - 876.6067

summary(SMA_deseas_totalperUS)
##
## Model estimated using sma() function: SMA(1)
## Response variable: y
## Distribution used in the estimation: Normal
## Loss function type: MSE; Loss function value: 0.4487
## All coefficients were provided
## Error standard deviation: 0.6711
## Sample size: 276
## Number of estimated parameters: 1
## Number of degrees of freedom: 275
## Information criteria:
## AIC AICc BIC BICc
## 564.0531 564.0677 567.6735 567.7145
checkresiduals(SMA_deseas_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals
## Q* = 10.428, df = 24, p-value = 0.9925
##
## Model df: 0. Total lags used: 24
# Simple Exponential Smoothing Model
SES_deseas_totalperUS = ses( y = deseas_totalper_trainUS, h=n_forUS,
holdout = FALSE, silent = FALSE)
summary(SES_deseas_totalperUS)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = deseas_totalper_trainUS, h = n_forUS, holdout = FALSE,
## silent = FALSE)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 4.3496
##
## sigma: 0.6723
##
## AIC AICc BIC
## 1336.031 1336.119 1346.892
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001830183 0.6698351 0.228668 -0.3309908 3.598532 0.2056142
## ACF1
## Training set 0.02778021
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2024 3.844512 2.9829569 4.706068 2.5268770 5.162148
## Feb 2024 3.844512 2.6261500 5.062875 1.9811878 5.707837
## Mar 2024 3.844512 2.3523542 5.336671 1.5624533 6.126572
## Apr 2024 3.844512 2.1215308 5.567494 1.2094395 6.479585
## May 2024 3.844512 1.9181701 5.770855 0.8984261 6.790599
## Jun 2024 3.844512 1.7343172 5.954708 0.6172473 7.071778
## Jul 2024 3.844512 1.5652465 6.123778 0.3586760 7.330349
## Aug 2024 3.844512 1.4078790 6.281146 0.1180032 7.571022
## Sep 2024 3.844512 1.2600760 6.428949 -0.1080420 7.797067
## Oct 2024 3.844512 1.1202803 6.568745 -0.3218411 8.010866
## Nov 2024 3.844512 0.9873162 6.701709 -0.5251921 8.214217
## Dec 2024 3.844512 0.8602706 6.828754 -0.7194916 8.408516
autoplot(SES_deseas_totalperUS)

checkresiduals(SES_deseas_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from Simple exponential smoothing
## Q* = 10.43, df = 24, p-value = 0.9925
##
## Model df: 0. Total lags used: 24
# SARIMA Model
SARIMA_totalperUS <- auto.arima(ts_US_train[,"Total.Per"])
print(SARIMA_totalperUS)
## Series: ts_US_train[, "Total.Per"]
## ARIMA(0,1,0)
##
## sigma^2 = 0.5215: log likelihood = -300.68
## AIC=603.35 AICc=603.37 BIC=606.97
SARIMA_forecast_totalperUS <- forecast(object = SARIMA_totalperUS, h=n_forUS)
autoplot(SARIMA_forecast_totalperUS)

checkresiduals(SARIMA_forecast_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 22.736, df = 24, p-value = 0.5354
##
## Model df: 0. Total lags used: 24
# Deseasonalized ARIMA Model: fit on the seasonally adjusted US series;
# max.D/max.P/max.Q = 0 disables all seasonal ARIMA terms so auto.arima
# searches non-seasonal orders only.
ARIMA_totalperUS <- auto.arima(deseas_totalper_trainUS, max.D = 0,
max.P = 0, max.Q = 0)
print(ARIMA_totalperUS)
## Series: deseas_totalper_trainUS
## ARIMA(0,1,0)
##
## sigma^2 = 0.4503: log likelihood = -280.51
## AIC=563.01 AICc=563.03 BIC=566.63
ARIMA_forecast_totalperUS <- forecast(object = ARIMA_totalperUS, h=n_forUS)
autoplot(ARIMA_forecast_totalperUS)

checkresiduals(ARIMA_forecast_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 10.428, df = 24, p-value = 0.9925
##
## Model df: 0. Total lags used: 24
# STL + ETS Model
ETS_totalperUS <- stlf(ts_US_train[,"Total.Per"],h=n_forUS)
autoplot(ETS_totalperUS)

checkresiduals(ETS_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 32.789, df = 24, p-value = 0.1086
##
## Model df: 0. Total lags used: 24
# ARIMA + FOURIER Model
ARIMA_Four_fit_totalperUS <- auto.arima(ts_US_train[,"Total.Per"],
seasonal=FALSE, lambda=0,
xreg=fourier(ts_US_train[,"Total.Per"],
K=3))
ARIMA_Four_for_totalperUS <- forecast(ARIMA_Four_fit_totalperUS,
xreg=fourier(ts_US_train[,"Total.Per"],
K=3, h=n_forUS),
h=n_forUS)
autoplot(ARIMA_Four_for_totalperUS)

checkresiduals(ARIMA_Four_for_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,1,0) errors
## Q* = 18.547, df = 24, p-value = 0.7757
##
## Model df: 0. Total lags used: 24
# TBATS Model
TBATS_fit_totalperUS <- tbats(ts_US_train[,"Total.Per"])
TBATS_for_totalperUS <- forecast(TBATS_fit_totalperUS, h = n_forUS)
autoplot(TBATS_for_totalperUS)

checkresiduals(TBATS_fit_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from TBATS
## Q* = 13.35, df = 24, p-value = 0.96
##
## Model df: 0. Total lags used: 24
# Neural Network Model
NN_fit_totalperUS <- nnetar(ts_US_train[,"Total.Per"],
p=3, P=0,
xreg=fourier(ts_US_train[,"Total.Per"], K=3))
NN_for_totalperUS <- forecast(NN_fit_totalperUS,
h=n_forUS,
xreg=fourier(ts_US_train[,"Total.Per"],
K=3,h=n_forUS))
autoplot(NN_for_totalperUS)

checkresiduals(NN_fit_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals from NNAR(3,5)
## Q* = 18.361, df = 24, p-value = 0.7851
##
## Model df: 0. Total lags used: 24
## State Space Exponential Smoothing Model
SSES_seas_totalperUS <- es(ts_US_train[,"Total.Per"],
model="ZZZ", h=n_forUS, holdout=FALSE)
checkresiduals(SSES_seas_totalperUS) # Residuals are iid.

##
## Ljung-Box test
##
## data: Residuals
## Q* = 27.974, df = 24, p-value = 0.2611
##
## Model df: 0. Total lags used: 24
## State Space with BSM Model
SS_seas_totalperUS <- StructTS(ts_US_train[,"Total.Per"],
type="BSM",fixed=c(0.1,0.01,0.1,NA))
SS_for_totalperUS <- forecast(SS_seas_totalperUS,h=n_forUS)
plot(SS_for_totalperUS)

checkresiduals(SS_seas_totalperUS) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from StructTS
## Q* = 127.75, df = 24, p-value = 4.441e-16
##
## Model df: 0. Total lags used: 24
Step 4: Performance check for US (Original)
# Check accuracy of each candidate model on the held-out US test window.
# accuracy(forecast_mean, actual) returns ME, RMSE, MAE, MPE, MAPE, ACF1 and
# Theil's U for the test set. (Fixes the "SANIVE" typo in the score name.)
SNAIVE_tpscoresUS <- accuracy(SNAIVE_deseas_totalperUS$mean, ts_US_test[,"Total.Per"])
SMA_tpscoresUS <- accuracy(SMA_deseas_totalperUS$forecast, ts_US_test[,"Total.Per"])
SES_tpscoresUS <- accuracy(SES_deseas_totalperUS$mean, ts_US_test[,"Total.Per"])
SARIMA_tpscoresUS <- accuracy(SARIMA_forecast_totalperUS$mean, ts_US_test[,"Total.Per"])
ARIMA_tpscoresUS <- accuracy(ARIMA_forecast_totalperUS$mean, ts_US_test[,"Total.Per"])
ETS_tpscoresUS <- accuracy(ETS_totalperUS$mean, ts_US_test[,"Total.Per"])
ARIMA_Four_tpscoresUS <- accuracy(ARIMA_Four_for_totalperUS$mean, ts_US_test[,"Total.Per"])
TBATS_tpscoresUS <- accuracy(TBATS_for_totalperUS$mean, ts_US_test[,"Total.Per"])
NN_tpscoresUS <- accuracy(NN_for_totalperUS$mean, ts_US_test[,"Total.Per"])
SSES_tpscoresUS <- accuracy(SSES_seas_totalperUS$forecast, ts_US_test[,"Total.Per"])
SS_tpscoresUS <- accuracy(SS_for_totalperUS$mean, ts_US_test[,"Total.Per"])

# Stack the score rows into one comparison table.
# NOTE(review): "Average" mixes metrics with different scales and signs
# (ME/MPE can be negative, Theil's U is unitless) — consider ranking on a
# single scale-free metric such as MAPE or RMSE instead.
tpscoresUS <- as.data.frame(rbind(SNAIVE_tpscoresUS, SMA_tpscoresUS,
                                  SES_tpscoresUS, SARIMA_tpscoresUS, ARIMA_tpscoresUS,
                                  ETS_tpscoresUS, ARIMA_Four_tpscoresUS, TBATS_tpscoresUS,
                                  NN_tpscoresUS, SSES_tpscoresUS, SS_tpscoresUS)) %>%
  mutate(Average = rowMeans(., na.rm = TRUE))
row.names(tpscoresUS) <- c("SNAIVE", "SMA", "SES", "SARIMA", "ARIMA",
                           "ETS", "ARIMA_FOURIER", "TBATS", "NNETAR",
                           "SSES", "BSM")

# Choose model with lowest average error
best_model_index_tpUS <- which.min(tpscoresUS[,"Average"])
cat("The best model by Average is:", row.names(tpscoresUS)[best_model_index_tpUS])
## The best model by Average is: BSM
# Render the US accuracy comparison table.
# BUG FIX: stripe_index previously referenced `seas_scores`, an object that
# is never defined in this document (copy-paste from another script and would
# error at render time); it must use tpscoresUS so the lowest-Average row is
# highlighted — matching the Peru version of this table.
kbl(tpscoresUS,
    caption = "Forecast Accuracy for Unemployment Rate (%) Data",
    digits = array(5, ncol(tpscoresUS))) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  # highlight the model with the lowest average error
  kable_styling(latex_options = "striped",
                stripe_index = which.min(tpscoresUS[, "Average"]))
Forecast Accuracy for Unemployment Rate (%) Data
|
|
ME
|
RMSE
|
MAE
|
MPE
|
MAPE
|
ACF1
|
Theil’s U
|
Average
|
|
SNAIVE
|
0.38333
|
0.40620
|
0.38333
|
9.46470
|
9.46470
|
0.28333
|
1.35284
|
3.10549
|
|
SMA
|
0.17216
|
0.33070
|
0.26108
|
3.80965
|
6.29631
|
0.44396
|
1.06312
|
1.76814
|
|
SES
|
0.17215
|
0.33070
|
0.26108
|
3.80953
|
6.29625
|
0.44396
|
1.06311
|
1.76811
|
|
SARIMA
|
0.51667
|
0.58878
|
0.51667
|
12.42930
|
12.42930
|
0.44396
|
1.85999
|
4.11210
|
|
ARIMA
|
0.17216
|
0.33070
|
0.26108
|
3.80965
|
6.29631
|
0.44396
|
1.06312
|
1.76814
|
|
ETS
|
-0.09918
|
0.54307
|
0.39400
|
-2.91283
|
10.30378
|
0.55949
|
1.95874
|
1.53529
|
|
ARIMA_FOURIER
|
0.40391
|
0.45420
|
0.40391
|
9.80344
|
9.80344
|
0.50899
|
1.46803
|
3.26370
|
|
TBATS
|
0.32666
|
0.38544
|
0.34616
|
7.88226
|
8.43937
|
0.53705
|
1.26470
|
2.74024
|
|
NNETAR
|
-5.32541
|
7.10765
|
5.35607
|
-130.13286
|
130.86290
|
0.80345
|
23.01979
|
4.52737
|
|
SSES
|
1.52521
|
1.71249
|
1.52948
|
38.05395
|
38.15815
|
0.73123
|
5.77401
|
12.49779
|
|
BSM
|
-0.72078
|
0.78299
|
0.72078
|
-18.28609
|
18.28609
|
0.56379
|
2.74789
|
0.58495
|
# Overlay every model's test-period forecast on the observed test data
autoplot(ts_US_test[,"Total.Per"]) +
autolayer(SNAIVE_deseas_totalperUS, PI=FALSE, series="SNAIVE") +
autolayer(SES_deseas_totalperUS, PI=FALSE, series="SES") +
autolayer(SARIMA_forecast_totalperUS, PI=FALSE, series="SARIMA") +
autolayer(ARIMA_forecast_totalperUS, PI=FALSE, series="ARIMA") +
autolayer(ETS_totalperUS, PI=FALSE, series="ETS") +
autolayer(ARIMA_Four_for_totalperUS, PI=FALSE, series="ARIMA_FOURIER") +
autolayer(TBATS_for_totalperUS, PI=FALSE, series="TBATS") +
autolayer(NN_for_totalperUS, PI=FALSE, size=0.7, series="NNETAR") +
autolayer(SS_for_totalperUS, PI=FALSE, series="BSM") +
guides(colour=guide_legend(title="Forecast")) # SMA and SSES objects lack an autolayer method, so they are omitted here

Step 5: Forecast for 2025 with the best three models for US
(Original)
Jisup: I think the SARIMA model cannot account for the COVID shock. That is
why the selected model is Forecast method: ARIMA(0,1,0). I tried several
times, but SARIMA could not produce a better fit.
# Set the forecasting period: 12 months ahead
# (idiom fix: use <- instead of = for top-level assignment)
n_fullUS <- 12
# Build a monthly time series from the full data set (no train/test split)
ts_US_fulltrain <- ts(US[,6],
start=c(year(US$Month[1]), month(US$Month[1])),
frequency = 12)
# Fit the state-space Basic Structural Model; fixed=c(0.1,0.01,0.1,NA) pins
# the first three variance parameters and estimates only the last (NA) —
# TODO confirm the parameter ordering against ?StructTS
SS_seas_totalper_fulltrainUS <- StructTS(ts_US_fulltrain,
type="BSM",fixed=c(0.1,0.01,0.1,NA))
SS_for_totalper_fulltrainUS <- forecast(SS_seas_totalper_fulltrainUS,h=n_fullUS)
# Plot model + observed data
autoplot(ts_US_fulltrain) +
autolayer(SS_for_totalper_fulltrainUS, series="SS with BSM Model",PI=FALSE)+
ylab("Forecasted Unemployment Rate (%) in US")

# Fit STL + ETS Model: stlf() forecasts via an STL decomposition with the
# seasonally adjusted series modeled by ETS
ETS_totalper_fulltrainUS <- stlf(ts_US_fulltrain,h=n_fullUS)
# Plot model + observed data
autoplot(ts_US_fulltrain) +
autolayer(ETS_totalper_fulltrainUS, series="ETS",PI=FALSE)+
ylab("Forecasted Unemployment Rate (%) in US")

# Simple Exponential Smoothing Model, fitted on the deseasonalized series
decom_totalper_fulltrainUS <- decompose(ts_US_fulltrain)
deseas_totalper_fulltrainUS <- seasadj(decom_totalper_fulltrainUS)
# Note: the SES forecast below has no seasonal component, since it is
# produced from the seasonally adjusted series
SES_deseas_totalper_fulltrainUS = ses( y = deseas_totalper_fulltrainUS,
h=n_fullUS, holdout = FALSE,
silent = FALSE)
# Plot model (deseasonalized forecast) over the original observed data
autoplot(ts_US_fulltrain) +
autolayer(SES_deseas_totalper_fulltrainUS, series="SES Model",PI=FALSE)+
ylab("Forecasted Unemployment Rate (%) in US")

# SARIMA Model: let auto.arima() search for the best (seasonal) ARIMA
autoplot(ts_US_fulltrain)

# Per the summary below, the search settles on ARIMA(0,1,0) — a random walk
# with no seasonal terms
SARIMA_totalper_fulltrainUS <- auto.arima(ts_US_fulltrain)
summary(SARIMA_totalper_fulltrainUS)
## Series: ts_US_fulltrain
## ARIMA(0,1,0)
##
## sigma^2 = 0.5044: log likelihood = -309.03
## AIC=620.07 AICc=620.08 BIC=623.73
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.003108681 0.7089948 0.3309191 -0.4788019 5.554759 0.3062833
## ACF1
## Training set 0.04163473
SARIMA_forecast_totalper_fulltrainUS <- forecast(object = SARIMA_totalper_fulltrainUS, h=n_fullUS)
summary(SARIMA_forecast_totalper_fulltrainUS)
##
## Forecast method: ARIMA(0,1,0)
##
## Model Information:
## Series: ts_US_fulltrain
## ARIMA(0,1,0)
##
## sigma^2 = 0.5044: log likelihood = -309.03
## AIC=620.07 AICc=620.08 BIC=623.73
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.003108681 0.7089948 0.3309191 -0.4788019 5.554759 0.3062833
## ACF1
## Training set 0.04163473
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2025 3.8 2.8898050 4.710195 2.4079768 5.192023
## Feb 2025 3.8 2.5127899 5.087210 1.8313820 5.768618
## Mar 2025 3.8 2.2234960 5.376504 1.3889452 6.211055
## Apr 2025 3.8 1.9796099 5.620390 1.0159537 6.584046
## May 2025 3.8 1.7647421 5.835258 0.6873416 6.912658
## Jun 2025 3.8 1.5704866 6.029513 0.3902535 7.209746
## Jul 2025 3.8 1.3918503 6.208150 0.1170529 7.482947
## Aug 2025 3.8 1.2255797 6.374420 -0.1372361 7.737236
## Sep 2025 3.8 1.0694149 6.530585 -0.3760695 7.976069
## Oct 2025 3.8 0.9217106 6.678289 -0.6019638 8.201964
## Nov 2025 3.8 0.7812246 6.818775 -0.8168185 8.416819
## Dec 2025 3.8 0.6469919 6.953008 -1.0221097 8.622110
# Plot SARIMA model + observed data
autoplot(ts_US_fulltrain) +
autolayer(SARIMA_forecast_totalper_fulltrainUS, series="SARIMA Model",PI=FALSE)+
ylab("Forecasted Unemployment Rate (%) in US")

# Plot all 4 full-sample forecasts (BSM, ETS, SES, SARIMA) together
autoplot(ts_US_fulltrain) +
autolayer(SS_for_totalper_fulltrainUS, series="SS with BSM Model",PI=FALSE)+
autolayer(ETS_totalper_fulltrainUS, series="ETS",PI=FALSE)+
autolayer(SES_deseas_totalper_fulltrainUS, series="SES Model",PI=FALSE)+
autolayer(SARIMA_forecast_totalper_fulltrainUS, series="SARIMA Model",PI=FALSE)+
ylab("Unemployment Rate (%)") +
ggtitle("Forecasted Unemployment Rate (%) in US")

Time Series Analysis for US (Outliers-removed Series)
Step 2: Decompose the time series for US (Outliers)
# Classical decomposition of the outliers-removed training series
decom_totalper_trainUSout <- decompose(ts_US_trainout)
plot(decom_totalper_trainUSout)

# Remove the seasonal component
deseas_totalper_trainUSout <- seasadj(decom_totalper_trainUSout)
plot(deseas_totalper_trainUSout)

# ADF test on the deseasonalized series: p = 0.47 fails to reject the unit root
print(adf.test(deseas_totalper_trainUSout, alternative = "stationary")) # It is unit root.
##
## Augmented Dickey-Fuller Test
##
## data: deseas_totalper_trainUSout
## Dickey-Fuller = -2.245, Lag order = 6, p-value = 0.4728
## alternative hypothesis: stationary
summary(MannKendall(deseas_totalper_trainUSout)) # It has a decreasing trend.
## Score = -11564 , Var(Score) = 2348637
## denominator = 37927.99
## tau = -0.305, 2-sided pvalue =4.5209e-14
# Run the tests on the original (not deseasonalized) series
print(adf.test(ts_US_trainout, alternative = "stationary")) # It is unit root.
##
## Augmented Dickey-Fuller Test
##
## data: ts_US_trainout
## Dickey-Fuller = -2.023, Lag order = 6, p-value = 0.5664
## alternative hypothesis: stationary
summary(SeasonalMannKendall(ts_US_trainout))
## Score = -944 , Var(Score) = 17157.33
## denominator = 3013.905
## tau = -0.313, 2-sided pvalue =5.7243e-13
summary(smk.test(ts_US_trainout)) # It has seasonality.
##
## Seasonal Mann-Kendall trend test (Hirsch-Slack test)
##
## data: ts_US_trainout
## alternative hypothesis: two.sided
##
## Statistics for individual seasons
##
## H0
## S varS tau z Pr(>|z|)
## Season 1: S = 0 -77 1429.7 -0.307 -2.010 0.0444311 *
## Season 2: S = 0 -83 1429.7 -0.331 -2.169 0.0301066 *
## Season 3: S = 0 -81 1429.0 -0.323 -2.116 0.0343207 *
## Season 4: S = 0 -79 1431.7 -0.313 -2.061 0.0392597 *
## Season 5: S = 0 -69 1427.0 -0.276 -1.800 0.0718447 .
## Season 6: S = 0 -71 1429.0 -0.283 -1.852 0.0640620 .
## Season 7: S = 0 -68 1430.7 -0.270 -1.771 0.0765017 .
## Season 8: S = 0 -63 1429.7 -0.251 -1.640 0.1010598
## Season 9: S = 0 -72 1430.7 -0.286 -1.877 0.0605034 .
## Season 10: S = 0 -81 1433.7 -0.320 -2.113 0.0346148 *
## Season 11: S = 0 -97 1429.7 -0.386 -2.539 0.0111186 *
## Season 12: S = 0 -103 1427.0 -0.412 -2.700 0.0069308 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for any differencing needed
print(ndiffs(ts_US_trainout))
## [1] 1
print(ndiffs(deseas_totalper_trainUSout))
## [1] 1
Step 3: Test Time Series Models for US (Outliers)
# Seasonal Naive Model
SNAIVE_deseas_totalperUSout <- snaive(ts_US_trainout, h=n_forUS)
autoplot(SNAIVE_deseas_totalperUSout)

checkresiduals(SNAIVE_deseas_totalperUSout)

##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 1369.4, df = 24, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 24
# Simple Moving Average Model
SMA_deseas_totalperUSout <- smooth::sma(y = deseas_totalper_trainUSout, h=n_forUS,
holdout = FALSE, silent = FALSE)
## Order 1 - -89.718; Order 100 - 1121.6598; Order 200 - 1115.5954
## Order 1 - -89.718; Order 50 - 1022.3643; Order 100 - 1121.6598
## Order 1 - -89.718; Order 25 - 854.3343; Order 50 - 1022.3643
## Order 1 - -89.718; Order 13 - 645.0819; Order 25 - 854.3343
## Order 1 - -89.718; Order 7 - 424.4507; Order 13 - 645.0819
## Order 1 - -89.718; Order 4 - 227.5604; Order 7 - 424.4507
## Order 1 - -89.718; Order 2 - 25.8563; Order 4 - 227.5604
## Order 12 - 616.949

summary(SMA_deseas_totalperUSout)
##
## Model estimated using sma() function: SMA(1)
## Response variable: y
## Distribution used in the estimation: Normal
## Loss function type: MSE; Loss function value: 0.042
## All coefficients were provided
## Error standard deviation: 0.2053
## Sample size: 276
## Number of estimated parameters: 1
## Number of degrees of freedom: 275
## Information criteria:
## AIC AICc BIC BICc
## -89.7326 -89.7180 -86.1122 -86.0712
checkresiduals(SMA_deseas_totalperUSout)

##
## Ljung-Box test
##
## data: Residuals
## Q* = 125.35, df = 24, p-value = 1.11e-15
##
## Model df: 0. Total lags used: 24
# Simple Exponential Smoothing Model
SES_deseas_totalperUSout = ses( y = deseas_totalper_trainUSout, h=n_forUS,
holdout = FALSE, silent = FALSE)
summary(SES_deseas_totalperUSout)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = deseas_totalper_trainUSout, h = n_forUS, holdout = FALSE,
## silent = FALSE)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 4.2968
##
## sigma: 0.2057
##
## AIC AICc BIC
## 682.3091 682.3974 693.1703
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.002000153 0.2049472 0.1502127 -0.1151749 2.706719 0.1638427
## ACF1
## Training set 0.2700784
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2024 3.744812 3.481205 4.008419 3.341659 4.147965
## Feb 2024 3.744812 3.372034 4.117590 3.174697 4.314927
## Mar 2024 3.744812 3.288261 4.201363 3.046578 4.443046
## Apr 2024 3.744812 3.217637 4.271987 2.938567 4.551057
## May 2024 3.744812 3.155415 4.334209 2.843408 4.646216
## Jun 2024 3.744812 3.099162 4.390462 2.757376 4.732248
## Jul 2024 3.744812 3.047432 4.442192 2.678262 4.811362
## Aug 2024 3.744812 2.999283 4.490341 2.604624 4.885000
## Sep 2024 3.744812 2.954060 4.535564 2.535462 4.954162
## Oct 2024 3.744812 2.911288 4.578336 2.470046 5.019578
## Nov 2024 3.744812 2.870605 4.619019 2.407828 5.081796
## Dec 2024 3.744812 2.831733 4.657891 2.348379 5.141245
autoplot(SES_deseas_totalperUSout)

checkresiduals(SES_deseas_totalperUSout)

##
## Ljung-Box test
##
## data: Residuals from Simple exponential smoothing
## Q* = 124.87, df = 24, p-value = 1.332e-15
##
## Model df: 0. Total lags used: 24
# SARIMA Model
SARIMA_totalperUSout <- auto.arima(ts_US_trainout)
print(SARIMA_totalperUSout)
## Series: ts_US_trainout
## ARIMA(4,1,0)(1,1,1)[12]
##
## Coefficients:
## ar1 ar2 ar3 ar4 sar1 sma1
## 0.1386 0.2472 0.1517 0.0489 -0.0319 -0.8654
## s.e. 0.0614 0.0624 0.0617 0.0624 0.0741 0.0545
##
## sigma^2 = 0.03982: log likelihood = 45.01
## AIC=-76.03 AICc=-75.59 BIC=-51.02
SARIMA_forecast_totalperUSout <- forecast(object = SARIMA_totalperUSout, h=n_forUS)
autoplot(SARIMA_forecast_totalperUSout)

checkresiduals(SARIMA_forecast_totalperUSout) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,0)(1,1,1)[12]
## Q* = 36.901, df = 18, p-value = 0.005399
##
## Model df: 6. Total lags used: 24
# Deaseasoned ARIMA Model
ARIMA_totalperUSout <- auto.arima(deseas_totalper_trainUSout, max.D = 0,
max.P = 0, max.Q = 0)
print(ARIMA_totalperUSout)
## Series: deseas_totalper_trainUSout
## ARIMA(1,1,2)
##
## Coefficients:
## ar1 ma1 ma2
## 0.7766 -0.6386 0.1651
## s.e. 0.0781 0.0931 0.0663
##
## sigma^2 = 0.03524: log likelihood = 71.16
## AIC=-134.31 AICc=-134.16 BIC=-119.85
ARIMA_forecast_totalperUSout <- forecast(object = ARIMA_totalperUSout, h=n_forUS)
autoplot(ARIMA_forecast_totalperUSout)

# NOTE(review): the Ljung-Box p-value below is 0.03692 < 0.05, so the
# residuals are NOT iid at the 5% level — the original comment said "iid"
checkresiduals(ARIMA_forecast_totalperUSout) # Residuals are not iid at the 5% level.

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)
## Q* = 33.924, df = 21, p-value = 0.03692
##
## Model df: 3. Total lags used: 24
# STL + ETS Model
ETS_totalperUSout <- stlf(ts_US_trainout,h=n_forUS)
autoplot(ETS_totalperUSout)

checkresiduals(ETS_totalperUSout) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,Ad,N)
## Q* = 65.36, df = 24, p-value = 1.078e-05
##
## Model df: 0. Total lags used: 24
# ARIMA + FOURIER Model
ARIMA_Four_fit_totalperUSout <- auto.arima(ts_US_trainout,
seasonal=FALSE, lambda=0,
xreg=fourier(ts_US_trainout,
K=3))
ARIMA_Four_for_totalperUSout <- forecast(ARIMA_Four_fit_totalperUSout,
xreg=fourier(ts_US_trainout,
K=3, h=n_forUS),
h=n_forUS)
autoplot(ARIMA_Four_for_totalperUSout)

checkresiduals(ARIMA_Four_for_totalperUSout) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,1,1) errors
## Q* = 76.526, df = 20, p-value = 1.514e-08
##
## Model df: 4. Total lags used: 24
# TBATS Model
TBATS_fit_totalperUSout <- tbats(ts_US_trainout)
TBATS_for_totalperUSout <- forecast(TBATS_fit_totalperUSout, h = n_forUS)
autoplot(TBATS_for_totalperUSout)

checkresiduals(TBATS_fit_totalperUSout) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from TBATS
## Q* = 52.491, df = 24, p-value = 0.0006748
##
## Model df: 0. Total lags used: 24
# Neural Network Model
NN_fit_totalperUSout <- nnetar(ts_US_trainout,
p=3, P=0,
xreg=fourier(ts_US_trainout, K=3))
NN_for_totalperUSout <- forecast(NN_fit_totalperUSout,
h=n_forUS,
xreg=fourier(ts_US_trainout,
K=3,h=n_forUS))
autoplot(NN_for_totalperUSout)

checkresiduals(NN_fit_totalperUSout) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from NNAR(3,5)
## Q* = 55.561, df = 24, p-value = 0.0002629
##
## Model df: 0. Total lags used: 24
## State Space Exponential Smoothing Model
SSES_seas_totalperUSout <- es(ts_US_trainout,
model="ZZZ", h=n_forUS, holdout=FALSE)
checkresiduals(SSES_seas_totalperUSout) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals
## Q* = 45.682, df = 24, p-value = 0.004832
##
## Model df: 0. Total lags used: 24
## State Space with BSM Model
SS_seas_totalperUSout <- StructTS(ts_US_trainout,
type="BSM",fixed=c(0.1,0.01,0.1,NA))
SS_for_totalperUSout <- forecast(SS_seas_totalperUSout,h=n_forUS)
plot(SS_for_totalperUSout)

checkresiduals(SS_seas_totalperUSout) # Residuals are not iid.

##
## Ljung-Box test
##
## data: Residuals from StructTS
## Q* = 124.91, df = 24, p-value = 1.332e-15
##
## Model df: 0. Total lags used: 24
Step 4: Performance check for US (Outliers)
# Check accuracy of each model's forecast against the held-out test set
# (fixed typo: SANIVE -> SNAIVE in the score object name)
SNAIVE_tpscoresUSout <- accuracy(SNAIVE_deseas_totalperUSout$mean,ts_US_testout)
SMA_tpscoresUSout <- accuracy(SMA_deseas_totalperUSout$forecast,ts_US_testout)
SES_tpscoresUSout <- accuracy(SES_deseas_totalperUSout$mean,ts_US_testout)
SARIMA_tpscoresUSout <- accuracy(SARIMA_forecast_totalperUSout$mean,ts_US_testout)
ARIMA_tpscoresUSout <- accuracy(ARIMA_forecast_totalperUSout$mean,ts_US_testout)
ETS_tpscoresUSout <- accuracy(ETS_totalperUSout$mean,ts_US_testout)
ARIMA_Four_tpscoresUSout <- accuracy(ARIMA_Four_for_totalperUSout$mean,ts_US_testout)
TBATS_tpscoresUSout <- accuracy(TBATS_for_totalperUSout$mean,ts_US_testout)
NN_tpscoresUSout <- accuracy(NN_for_totalperUSout$mean,ts_US_testout)
SSES_tpscoresUSout <- accuracy(SSES_seas_totalperUSout$forecast,ts_US_testout)
SS_tpscoresUSout <- accuracy(SS_for_totalperUSout$mean,ts_US_testout)
# Stack all accuracy matrices (one row per model) and add a row-wise Average
# across the error measures, ignoring NAs
# NOTE(review): Average mixes signed measures (ME, MPE) with unsigned ones
# (RMSE, MAE, MAPE) — confirm this is the intended selection criterion
tpscoresUSout <- as.data.frame(rbind(SNAIVE_tpscoresUSout, SMA_tpscoresUSout,
SES_tpscoresUSout, SARIMA_tpscoresUSout, ARIMA_tpscoresUSout,
ETS_tpscoresUSout, ARIMA_Four_tpscoresUSout, TBATS_tpscoresUSout,
NN_tpscoresUSout, SSES_tpscoresUSout, SS_tpscoresUSout)) %>%
mutate(Average = rowMeans(., na.rm = TRUE))
row.names(tpscoresUSout) <- c("SNAIVE", "SMA", "SES", "SARIMA", "ARIMA",
"ETS", "ARIMA_FOURIER", "TBATS", "NNETAR",
"SSES", "BSM")
# Choose the model with the lowest average error
best_model_index_tpUSout <- which.min(tpscoresUSout[,"Average"])
cat("The best model by Average is:", row.names(tpscoresUSout[best_model_index_tpUSout,]))
## The best model by Average is: BSM
# Render the accuracy table
kbl(tpscoresUSout,
caption = "Forecast Accuracy for Unemployment Rate (%) Data",
digits = array(5,ncol(tpscoresUSout))) %>%
kable_styling(full_width = FALSE, position = "center") %>%
# Highlight the model with the lowest Average error
# (bug fix: previously used which.min(seas_scores[,"Average"]), a score
# table from a different section, so the wrong row was striped)
kable_styling(latex_options="striped", stripe_index = which.min(tpscoresUSout[,"Average"]))
Forecast Accuracy for Unemployment Rate (%) Data
|
|
ME
|
RMSE
|
MAE
|
MPE
|
MAPE
|
ACF1
|
Theil’s U
|
Average
|
|
SNAIVE
|
0.38333
|
0.40620
|
0.38333
|
9.46470
|
9.46470
|
0.28333
|
1.35284
|
3.10549
|
|
SMA
|
0.27186
|
0.39196
|
0.32013
|
6.30418
|
7.67176
|
0.44396
|
1.24293
|
2.37811
|
|
SES
|
0.27185
|
0.39195
|
0.32013
|
6.30406
|
7.67168
|
0.44396
|
1.24292
|
2.37808
|
|
SARIMA
|
0.29883
|
0.35509
|
0.30781
|
7.39350
|
7.61242
|
0.58243
|
1.20095
|
2.53586
|
|
ARIMA
|
0.35322
|
0.45308
|
0.38245
|
8.34064
|
9.17576
|
0.46153
|
1.44837
|
2.94501
|
|
ETS
|
0.08226
|
0.13550
|
0.10280
|
2.00098
|
2.52297
|
0.12960
|
0.46055
|
0.77638
|
|
ARIMA_FOURIER
|
0.50944
|
0.55013
|
0.50944
|
12.48430
|
12.48430
|
0.55452
|
1.81939
|
4.13022
|
|
TBATS
|
0.44741
|
0.49729
|
0.44741
|
10.95395
|
10.95395
|
0.62666
|
1.65645
|
3.65473
|
|
NNETAR
|
0.06192
|
0.12128
|
0.09470
|
1.40104
|
2.27327
|
0.17642
|
0.39597
|
0.64637
|
|
SSES
|
0.41563
|
0.47631
|
0.42271
|
10.27258
|
10.44526
|
0.65134
|
1.60031
|
3.46916
|
|
BSM
|
-0.10734
|
0.14557
|
0.12043
|
-2.77658
|
3.06756
|
0.03085
|
0.49704
|
0.13965
|
# Overlay every model's test-period forecast on the observed test data
autoplot(ts_US_testout) +
autolayer(SNAIVE_deseas_totalperUSout, PI=FALSE, series="SNAIVE") +
autolayer(SES_deseas_totalperUSout, PI=FALSE, series="SES") +
autolayer(SARIMA_forecast_totalperUSout, PI=FALSE, series="SARIMA") +
autolayer(ARIMA_forecast_totalperUSout, PI=FALSE, series="ARIMA") +
autolayer(ETS_totalperUSout, PI=FALSE, series="ETS") +
autolayer(ARIMA_Four_for_totalperUSout, PI=FALSE, series="ARIMA_FOURIER") +
autolayer(TBATS_for_totalperUSout, PI=FALSE, series="TBATS") +
autolayer(NN_for_totalperUSout, PI=FALSE, series="NNETAR") +
autolayer(SS_for_totalperUSout, PI=FALSE, series="BSM") +
guides(colour=guide_legend(title="Forecast")) # SMA and SSES objects lack an autolayer method, so they are omitted here

Step 5: Forecast for 2025 with the best three models for US
(Outliers)
# Set the forecasting period: 12 months ahead
# (idiom fix: use <- instead of = for top-level assignment)
n_fullUS <- 12
# Re-date the full outliers-removed series as a monthly time series
ts_US_fulltrainout <- ts(ts_USout,
start=c(year(US$Month[1]), month(US$Month[1])),
frequency = 12)
# Fit SS with BSM Model
# NOTE(review): fixed=c(0.1,0.1,0.1,NA) differs from the Original section,
# which used c(0.1,0.01,0.1,NA) — confirm this difference is intentional
SS_seas_totalper_fulltrainUSout <- StructTS(ts_US_fulltrainout,
type="BSM",fixed=c(0.1,0.1,0.1,NA))
SS_for_totalper_fulltrainUSout <- forecast(SS_seas_totalper_fulltrainUSout,h=n_fullUS)
# Plot model + observed data
autoplot(ts_US_fulltrainout) +
autolayer(SS_for_totalper_fulltrainUSout, series="SS with BSM Model",PI=FALSE)+
ylab("Forecasted Unemployment Rate (%) in US")

# Fit Neural Network Model: NNAR with 3 non-seasonal lags (p=3), no seasonal
# lags (P=0), and K=3 Fourier harmonic pairs as external regressors
NN_fit_totalper_fulltrainUSout <- nnetar(ts_US_fulltrainout,
p=3, P=0,
xreg=fourier(ts_US_fulltrainout, K=3))
# The same Fourier terms must be supplied for the forecast horizon
NN_for_totalper_fulltrainUSout <- forecast(NN_fit_totalper_fulltrainUSout,
h=n_fullUS,
xreg=fourier(ts_US_fulltrainout,
K=3,h=n_fullUS))
# Plot model + observed data
autoplot(ts_US_fulltrainout) +
autolayer(NN_for_totalper_fulltrainUSout, series="NNETAR",PI=FALSE)+
ylab("Forecasted Unemployment Rate (%) in US")

# Fit STL + ETS Model on the full outliers-removed series
ETS_totalper_fulltrainUSout <- stlf(ts_US_fulltrainout,h=n_fullUS)
# Plot model + observed data
autoplot(ts_US_fulltrainout) +
autolayer(ETS_totalper_fulltrainUSout, series="ETS",PI=FALSE)+
ylab("Forecasted Unemployment Rate (%) in US")

# Plot the 3 best models together on the outliers-removed series
# (bug fix: the base layer previously plotted ts_US_fulltrain — the original
# series WITH outliers — under the "Outliers Removed" title; it now plots
# ts_US_fulltrainout. Also: only 3 models are layered, not 4.)
autoplot(ts_US_fulltrainout) +
autolayer(SS_for_totalper_fulltrainUSout, series="SS with BSM Model",PI=FALSE)+
autolayer(NN_for_totalper_fulltrainUSout, series="NNETAR",PI=FALSE)+
autolayer(ETS_totalper_fulltrainUSout, series="ETS",PI=FALSE)+
ylab("Unemployment Rate (%)") +
ggtitle("Forecasted Unemployment Rate (%) in US (Outliers Removed)")

Jisup: I added the average of the three models above, since I could not
decide which one is the best. :)
# --- 1. Calculate the average forecast of the three models ---
# Extract predicted means as plain numeric vectors
bsm_fc <- as.numeric(SS_for_totalper_fulltrainUSout$mean)
nnar_fc <- as.numeric(NN_for_totalper_fulltrainUSout$mean)
ets_fc <- as.numeric(ETS_totalper_fulltrainUSout$mean)
# Compute the equal-weight ensemble average forecast
avg_fc <- (bsm_fc + nnar_fc + ets_fc) / 3
# Create date index for the forecast horizon
# (removed dead code: the original start_date line called end(US$Month),
# which is not meaningful for a Date vector, and its result was never used)
forecast_dates <- seq(from = as.Date("2025-01-01"), by = "month", length.out = n_fullUS)
# Collect individual model forecasts and their average in one data frame
forecast_df <- tibble(
Month = forecast_dates,
BSM = bsm_fc,
NNAR = nnar_fc,
ETS = ets_fc,
Avg_Forecast = avg_fc
)
# --- 2. Export forecast to Excel ---
write.xlsx(forecast_df, "../Forecast_Average_US.xlsx")
# --- 3. Plot actuals (from 2020) and forecast average ---
# Extract actual values after 2020
us_actual_2020on <- US %>%
filter(Month >= as.Date("2020-01-01")) %>%
select(Month, Actual = Total.Per)
# Combine with forecast
# NOTE(review): this plot_df is overwritten by the 2022 version in the next
# section before any plot uses it, so the 2020-based chart is never drawn —
# confirm whether a plot was intended here
plot_df <- bind_rows(
tibble(Month = us_actual_2020on$Month,
Value = us_actual_2020on$Actual,
Type = "Actual"),
tibble(Month = forecast_dates,
Value = avg_fc,
Type = "Avg Forecast")
)
# --- 4. Plot actuals (from 2022) and forecast average ---
# Extract actual values after 2022
us_actual_2022on <- US %>%
filter(Month >= as.Date("2022-01-01")) %>%
select(Month, Actual = Total.Per)
# Combine actuals and the averaged forecast into one long data frame
plot_df <- bind_rows(
tibble(Month = us_actual_2022on$Month,
Value = us_actual_2022on$Actual,
Type = "Actual"),
tibble(Month = forecast_dates,
Value = avg_fc,
Type = "Avg Forecast")
)
# Plot: group = 1 draws one continuous line across both Types, so the
# forecast segment connects to the last actual observation
# NOTE(review): geom_line(size = ) is deprecated since ggplot2 3.4 in favor
# of linewidth — consider updating if the installed version supports it
ggplot(plot_df, aes(x = Month, y = Value, color = Type, group = 1)) +
geom_line(size = 1) +
scale_color_manual(values = c("Actual" = "black", "Avg Forecast" = "steelblue")) +
labs(
title = "US Unemployment Rate: Actual (2022~) + Avg Forecast (2025)",
y = "Unemployment Rate (%)",
x = "Month",
color = "Legend"
) +
theme_minimal()
